From 956afcd73cd695e6fff7d8abaddf3a3dc3497e3b Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 29 Aug 2024 18:23:58 -0400 Subject: [PATCH 001/128] Implement TPC-H data loader --- configs/tpch_test.conf | 19 ++++ data/__init__.py | 1 + data/tpch_loader.py | 198 +++++++++++++++++++++++++++++++++++++++++ main.py | 14 ++- rpc/tpch_utils.py | 86 ++---------------- 5 files changed, 237 insertions(+), 81 deletions(-) create mode 100644 configs/tpch_test.conf create mode 100644 data/tpch_loader.py diff --git a/configs/tpch_test.conf b/configs/tpch_test.conf new file mode 100644 index 00000000..6da9d431 --- /dev/null +++ b/configs/tpch_test.conf @@ -0,0 +1,19 @@ +# Output configs. +--log=./tpch_test.log +--log_level=debug +--csv=./tpch_test.csv + +# Task configs. +--runtime_variance=0 + +# Scheduler configs. +--scheduler=EDF +--scheduler_runtime=0 +--enforce_deadlines + +# Execution mode configs. +--execution_mode=replay +--replay_trace=tpch + +# TPCH flags +--tpch_query_dag_spec=profiles/workload/tpch/queries.yaml diff --git a/data/__init__.py b/data/__init__.py index ec2c2986..8c185fa6 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -7,6 +7,7 @@ from .task_loader_benchmark import TaskLoaderBenchmark from .task_loader_pylot import TaskLoaderPylot from .task_loader_synthetic import TaskLoaderSynthetic +from .tpch_loader import TpchLoader from .worker_loader import WorkerLoader from .worker_loader_benchmark import WorkerLoaderBenchmark from .workload_loader import WorkloadLoader diff --git a/data/tpch_loader.py b/data/tpch_loader.py new file mode 100644 index 00000000..a0e6622d --- /dev/null +++ b/data/tpch_loader.py @@ -0,0 +1,198 @@ +import yaml +from typing import Any, Dict, List, Optional +from pathlib import Path + +import numpy as np + +from utils import EventTime +from workload import ( + Workload, + WorkProfile, + Job, + JobGraph, + ExecutionStrategy, + ExecutionStrategies, + Resource, + Resources, +) + +from .base_workload_loader import 
BaseWorkloadLoader + + +class TpchLoader(BaseWorkloadLoader): + """Loads the TPCH trace from the provided file + + Args: + path (`str`): Path to a YAML file specifying the TPC-H query DAGs + """ + def __init__(self, path: str) -> None: + with open(path, "r") as f: + workload_data = yaml.safe_load(f) + + job_graphs = {} + for query in workload_data["graphs"]: + query_name = query["name"] + graph = query["graph"] + + job_graph = TpchLoader.parse_job_graph(query_name=query_name, graph=graph) + job_graphs[query_name] = job_graph + + self._workloads = iter([Workload.from_job_graphs(job_graphs)]) + + + @staticmethod + def parse_job_graph(query_name: str, graph: List[Dict[str, Any]]) -> JobGraph: + job_graph = JobGraph( + name=query_name, + + # TODO: make configurable + release_policy=JobGraph.ReleasePolicy.fixed( + period=EventTime(30, EventTime.Unit.US), + num_invocations=10, + start=EventTime(0, EventTime.Unit.US), + ), + + # TODO: make configurable + deadline_variance=(0,0), + ) + + # TODO: make configurable + profiler_path = "./profiles/workload/tpch/decima/2g" + query_num = int(query_name[1:]) + profiler_data = TpchLoader.get_profiler_data_for_query(profiler_path, query_num) + + name_to_job = {} + for node in graph: + # TODO: make profile_path configurable + worker_profile = TpchLoader.load_query_profile( + profiler_data=profiler_data, + query_name=query_name, + node_name=node["name"], + ) + job = Job(name=node["name"], profile=worker_profile) + name_to_job[node["name"]] = job + job_graph.add_job(job=job) + + for node in graph: + job = name_to_job[node["name"]] + if "children" in node: + for child in node["children"]: + if child not in name_to_job: + raise ValueError( + f"Child {child} of {node['name']} was " + f"not present in the graph." 
+ ) + child_job = name_to_job[child] + job_graph.add_child(job, child_job) + + return job_graph + + + @staticmethod + def load_query_profile(profiler_data: Dict[int, Dict[str, Any]], query_name: str, node_name: str) -> WorkProfile: + profile = profiler_data[int(node_name)] + resources = Resources( + resource_vector={ + Resource(name="Slot", _id="any"): profile["num_tasks"], + }, + ) + execution_strategies = ExecutionStrategies() + execution_strategies.add_strategy( + strategy=ExecutionStrategy( + resources=resources, + batch_size=1, + runtime=profile["avg_task_duration"], + ), + ) + return WorkProfile( + name=f"{query_name}_{node_name}_execution_profile", + execution_strategies=execution_strategies, + ) + + + @staticmethod + def get_profiler_data_for_query(profile_path: str, query_num: int) -> Dict[int, Dict[str, Any]]: + def pre_process_task_duration(task_duration): + # remove fresh durations from first wave + clean_first_wave = {} + for e in task_duration["first_wave"]: + clean_first_wave[e] = [] + fresh_durations = SetWithCount() + for d in task_duration["fresh_durations"][e]: + fresh_durations.add(d) + for d in task_duration["first_wave"][e]: + if d not in fresh_durations: + clean_first_wave[e].append(d) + else: + # prevent duplicated fresh duration blocking first wave + fresh_durations.remove(d) + + task_durations = np.load( + Path(profile_path) / f"task_duration_{query_num}.npy", + allow_pickle=True, + ).item() + + num_nodes = len(task_durations) + + stage_info = {} + + for n in range(num_nodes): + task_duration = task_durations[n] + e = next(iter(task_duration["first_wave"])) + # NOTE: somehow only picks the first element {2: [n_tasks_in_ms]} + + num_tasks = len(task_duration["first_wave"][e]) + len( + task_duration["rest_wave"][e] + ) + + # remove fresh duration from first wave duration + # drag nearest neighbor first wave duration to empty spots + pre_process_task_duration(task_duration) + rough_duration = np.mean( + [i for t in 
task_duration["first_wave"].values() for i in t] + + [i for t in task_duration["rest_wave"].values() for i in t] + + [i for t in task_duration["fresh_durations"].values() for i in t] + ) + + curr_stage = { + "stage_id": n, + "num_tasks": num_tasks, + "avg_task_duration": round(rough_duration), + } + stage_info[n] = curr_stage + + return stage_info + + + def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: + try: + return next(self._workloads) + except StopIteration: + return None + + +class SetWithCount(object): + """ + allow duplication in set + """ + + def __init__(self): + self.set = {} + + def __contains__(self, item): + return item in self.set + + def add(self, item): + if item in self.set: + self.set[item] += 1 + else: + self.set[item] = 1 + + def clear(self): + self.set.clear() + + def remove(self, item): + self.set[item] -= 1 + if self.set[item] == 0: + del self.set[item] + diff --git a/main.py b/main.py index b2df225a..3a58f6e8 100644 --- a/main.py +++ b/main.py @@ -9,6 +9,7 @@ TaskLoaderBenchmark, TaskLoaderPylot, TaskLoaderSynthetic, + TpchLoader, WorkerLoader, WorkerLoaderBenchmark, WorkloadLoader, @@ -34,7 +35,7 @@ flags.DEFINE_enum( "replay_trace", "pylot", - ["pylot", "clockwork_bursty", "alibaba"], + ["pylot", "clockwork_bursty", "alibaba", "tpch"], "Sets the trace to replay in the replay mode.", ) flags.DEFINE_string( @@ -130,6 +131,13 @@ "benchmark_num_cpus", 10, "Number of CPUs available for benchmarking." ) +# TPCH related flags +flags.DEFINE_string( + "tpch_query_dag_spec", + "./profiles/workload/tpch/queries.yaml", + "Path to a YAML file specifying the TPC-H query DAGs", +) + # AlibabaLoader related flags. flags.DEFINE_integer( "alibaba_loader_task_cpu_multiplier", @@ -633,6 +641,10 @@ def main(args): ), flags=FLAGS, ) + elif FLAGS.replay_trace == "tpch": + workload_loader = TpchLoader( + path=FLAGS.tpch_query_dag_spec, + ) else: raise NotImplementedError( f"Replay trace {FLAGS.replay_trace} is not implemented yet." 
diff --git a/rpc/tpch_utils.py b/rpc/tpch_utils.py index ebc4e3cd..9c1b2e55 100644 --- a/rpc/tpch_utils.py +++ b/rpc/tpch_utils.py @@ -8,91 +8,17 @@ import networkx as nx import numpy as np +from data import TpchLoader + HOME_TPCH_DIR = "../profiles/workload/tpch_decima/" TPCH_SUBDIR = "2g/" -class SetWithCount(object): - """ - allow duplication in set - """ - - def __init__(self): - self.set = {} - - def __contains__(self, item): - return item in self.set - - def add(self, item): - if item in self.set: - self.set[item] += 1 - else: - self.set[item] = 1 - - def clear(self): - self.set.clear() - - def remove(self, item): - self.set[item] -= 1 - if self.set[item] == 0: - del self.set[item] - - -def pre_process_task_duration(task_duration): - # remove fresh durations from first wave - clean_first_wave = {} - for e in task_duration["first_wave"]: - clean_first_wave[e] = [] - fresh_durations = SetWithCount() - # O(1) access - for d in task_duration["fresh_durations"][e]: - fresh_durations.add(d) - for d in task_duration["first_wave"][e]: - if d not in fresh_durations: - clean_first_wave[e].append(d) - else: - # prevent duplicated fresh duration blocking first wave - fresh_durations.remove(d) - - def get_all_stage_info_for_query(query_num): - task_durations = np.load( - os.path.join( - HOME_TPCH_DIR, TPCH_SUBDIR, "task_duration_" + str(query_num) + ".npy" - ), - allow_pickle=True, - ).item() - - num_nodes = len(task_durations) - - stage_info = {} - - for n in range(num_nodes): - task_duration = task_durations[n] - e = next(iter(task_duration["first_wave"])) - # NOTE: somehow only picks the first element {2: [n_tasks_in_ms]} - - num_tasks = len(task_duration["first_wave"][e]) + len( - task_duration["rest_wave"][e] - ) - - # remove fresh duration from first wave duration - # drag nearest neighbor first wave duration to empty spots - pre_process_task_duration(task_duration) - rough_duration = np.mean( - [i for t in task_duration["first_wave"].values() for i in t] - + [i for 
t in task_duration["rest_wave"].values() for i in t] - + [i for t in task_duration["fresh_durations"].values() for i in t] - ) - - curr_stage = { - "stage_id": n, - "num_tasks": num_tasks, - "avg_task_duration": round(rough_duration), - } - stage_info[n] = curr_stage - - return stage_info + return TpchLoader.get_profiler_data_for_query( + profile_path=os.path.join(HOME_TPCH_DIR, TPCH_SUBDIR), + query_num=query_num, + ) def get_base_tpch_graph_structure(query_num): From a25fbe864d0807482d8228da2f33c64e3e0d211b Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 5 Sep 2024 10:10:14 -0400 Subject: [PATCH 002/128] Bug fix: convert job graph to task graph --- data/tpch_loader.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index a0e6622d..39d1fc9f 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -1,4 +1,6 @@ +import sys import yaml + from typing import Any, Dict, List, Optional from pathlib import Path @@ -37,7 +39,11 @@ def __init__(self, path: str) -> None: job_graph = TpchLoader.parse_job_graph(query_name=query_name, graph=graph) job_graphs[query_name] = job_graph - self._workloads = iter([Workload.from_job_graphs(job_graphs)]) + # TODO: configurable? 
+ loop_timeout = EventTime(time=sys.maxsize, unit=EventTime.Unit.US) + workload = Workload.from_job_graphs(job_graphs) + workload.populate_task_graphs(completion_time=loop_timeout) + self._workloads = iter([workload]) @staticmethod @@ -101,7 +107,7 @@ def load_query_profile(profiler_data: Dict[int, Dict[str, Any]], query_name: str strategy=ExecutionStrategy( resources=resources, batch_size=1, - runtime=profile["avg_task_duration"], + runtime=EventTime(profile["avg_task_duration"], EventTime.Unit.US), ), ) return WorkProfile( From 4d06a95be805c23cda438e650fc4b3fef0eadcde Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 5 Sep 2024 10:37:09 -0400 Subject: [PATCH 003/128] Make loop_timeout configurable --- data/tpch_loader.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index 39d1fc9f..d8999d8c 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -1,10 +1,11 @@ import sys -import yaml from typing import Any, Dict, List, Optional from pathlib import Path +import absl #noqa: F401 import numpy as np +import yaml from utils import EventTime from workload import ( @@ -27,7 +28,12 @@ class TpchLoader(BaseWorkloadLoader): Args: path (`str`): Path to a YAML file specifying the TPC-H query DAGs """ - def __init__(self, path: str) -> None: + def __init__(self, path: str, _flags: Optional["absl.flags"] = None) -> None: + if _flags: + self._loop_timeout = _flags.loop_timeout + else: + self._loop_timeout = EventTime(time=sys.maxsize, unit=EventTime.Unit.US) + with open(path, "r") as f: workload_data = yaml.safe_load(f) @@ -39,10 +45,8 @@ def __init__(self, path: str) -> None: job_graph = TpchLoader.parse_job_graph(query_name=query_name, graph=graph) job_graphs[query_name] = job_graph - # TODO: configurable? 
- loop_timeout = EventTime(time=sys.maxsize, unit=EventTime.Unit.US) workload = Workload.from_job_graphs(job_graphs) - workload.populate_task_graphs(completion_time=loop_timeout) + workload.populate_task_graphs(completion_time=self._loop_timeout) self._workloads = iter([workload]) From c756d17454bd14c4805d6815ef09ae77e8718412 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 5 Sep 2024 10:48:55 -0400 Subject: [PATCH 004/128] Make profile path configurable --- data/tpch_loader.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index d8999d8c..63467f1a 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -27,12 +27,15 @@ class TpchLoader(BaseWorkloadLoader): Args: path (`str`): Path to a YAML file specifying the TPC-H query DAGs + _flags (`absl.flags`): The flags used to initialize the app, if any """ def __init__(self, path: str, _flags: Optional["absl.flags"] = None) -> None: if _flags: self._loop_timeout = _flags.loop_timeout + self._workload_profile_path = _flags.workload_profile_path else: self._loop_timeout = EventTime(time=sys.maxsize, unit=EventTime.Unit.US) + self._workload_profile_path = "./profiles/workload/tpch/decima/2g" with open(path, "r") as f: workload_data = yaml.safe_load(f) @@ -41,8 +44,11 @@ def __init__(self, path: str, _flags: Optional["absl.flags"] = None) -> None: for query in workload_data["graphs"]: query_name = query["name"] graph = query["graph"] - - job_graph = TpchLoader.parse_job_graph(query_name=query_name, graph=graph) + job_graph = TpchLoader.make_job_graph( + query_name=query_name, + graph=graph, + profile_path=self._workload_profile_path, + ) job_graphs[query_name] = job_graph workload = Workload.from_job_graphs(job_graphs) @@ -51,7 +57,7 @@ def __init__(self, path: str, _flags: Optional["absl.flags"] = None) -> None: @staticmethod - def parse_job_graph(query_name: str, graph: List[Dict[str, Any]]) -> JobGraph: + def 
make_job_graph(query_name: str, graph: List[Dict[str, Any]], profile_path: str) -> JobGraph: job_graph = JobGraph( name=query_name, @@ -66,14 +72,11 @@ def parse_job_graph(query_name: str, graph: List[Dict[str, Any]]) -> JobGraph: deadline_variance=(0,0), ) - # TODO: make configurable - profiler_path = "./profiles/workload/tpch/decima/2g" query_num = int(query_name[1:]) - profiler_data = TpchLoader.get_profiler_data_for_query(profiler_path, query_num) + profiler_data = TpchLoader.get_profiler_data_for_query(profile_path, query_num) name_to_job = {} for node in graph: - # TODO: make profile_path configurable worker_profile = TpchLoader.load_query_profile( profiler_data=profiler_data, query_name=query_name, From 9f6aa8be224a8297336ae61b85cad8bfca978202 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 19 Sep 2024 14:19:42 -0400 Subject: [PATCH 005/128] release time handling on workload --- configs/tpch_test.conf | 4 ++ data/tpch_loader.py | 143 ++++++++++++++++++++++++++++++++--------- main.py | 12 ++++ requirements.txt | 1 + 4 files changed, 128 insertions(+), 32 deletions(-) diff --git a/configs/tpch_test.conf b/configs/tpch_test.conf index 6da9d431..a7fe9073 100644 --- a/configs/tpch_test.conf +++ b/configs/tpch_test.conf @@ -15,5 +15,9 @@ --execution_mode=replay --replay_trace=tpch +# Release time config. 
+--override_arrival_period=5 +--override_num_invocation=10 + # TPCH flags --tpch_query_dag_spec=profiles/workload/tpch/queries.yaml diff --git a/data/tpch_loader.py b/data/tpch_loader.py index 63467f1a..bd2fedf7 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -1,12 +1,15 @@ -import sys +import random +from pathlib import Path from typing import Any, Dict, List, Optional from pathlib import Path -import absl #noqa: F401 +import absl import numpy as np import yaml +from more_itertools import before_and_after + from utils import EventTime from workload import ( Workload, @@ -22,20 +25,40 @@ from .base_workload_loader import BaseWorkloadLoader +""" +- [ ] Release policy based on workload +- [ ] Fix current time setting +- [ ] Configure deadline variance +- [ ] Configure release policy +""" + + class TpchLoader(BaseWorkloadLoader): """Loads the TPCH trace from the provided file - + Args: path (`str`): Path to a YAML file specifying the TPC-H query DAGs - _flags (`absl.flags`): The flags used to initialize the app, if any + flags (`absl.flags`): The flags used to initialize the app, if any """ - def __init__(self, path: str, _flags: Optional["absl.flags"] = None) -> None: - if _flags: - self._loop_timeout = _flags.loop_timeout - self._workload_profile_path = _flags.workload_profile_path + + def __init__(self, path: str, flags: "absl.flags") -> None: + self._flags = flags + self._rng_seed = flags.random_seed + self._rng = random.Random(self._rng_seed) + self._loop_timeout = flags.loop_timeout + self._num_queries = flags.tpch_num_queries + self._dataset_size = flags.tpch_dataset_size + if flags.workload_profile_path: + self._workload_profile_path = str( + Path(flags.workload_profile_path) / f"{self._dataset_size}g" + ) else: - self._loop_timeout = EventTime(time=sys.maxsize, unit=EventTime.Unit.US) self._workload_profile_path = "./profiles/workload/tpch/decima/2g" + self._workload_update_interval = EventTime(10, EventTime.Unit.US) + release_policy = 
self._get_release_policy() + self._release_times = release_policy.get_release_times( + completion_time=EventTime(self._flags.loop_timeout, EventTime.Unit.US) + ) with open(path, "r") as f: workload_data = yaml.safe_load(f) @@ -51,25 +74,72 @@ def __init__(self, path: str, _flags: Optional["absl.flags"] = None) -> None: ) job_graphs[query_name] = job_graph - workload = Workload.from_job_graphs(job_graphs) - workload.populate_task_graphs(completion_time=self._loop_timeout) - self._workloads = iter([workload]) + self._job_graphs = job_graphs + self._workload = Workload.empty(flags) + def _get_release_policy(self): + release_policy_args = {} + if self._flags.override_release_policy == "periodic": + release_policy_args = { + "period": EventTime( + self._flags.override_arrival_period, EventTime.Unit.US + ), + } + elif self._flags.override_release_policy == "fixed": + release_policy_args = { + "period": EventTime( + self._flags.override_arrival_period, EventTime.Unit.US + ), + "num_invocations": self._flags.override_num_invocation, + } + elif self._flags.override_release_policy == "poisson": + release_policy_args = { + "rate": self._flags.override_poisson_arrival_rate, + "num_invocations": self._flags.override_num_invocation, + } + elif self._flags.override_release_policy == "gamma": + release_policy_args = { + "rate": self._flags.override_poisson_arrival_rate, + "num_invocations": self._flags.override_num_invocation, + "coefficient": self._flags.override_gamma_coefficient, + } + elif self._flags.override_release_policy == "fixed_gamma": + release_policy_args = { + "variable_arrival_rate": self._flags.override_poisson_arrival_rate, + "base_arrival_rate": self._flags.override_base_arrival_rate, + "num_invocations": self._flags.override_num_invocation, + "coefficient": self._flags.override_gamma_coefficient, + } + else: + raise NotImplementedError( + f"Release policy {self._flags.override_release_policy} not implemented." 
+ ) - @staticmethod - def make_job_graph(query_name: str, graph: List[Dict[str, Any]], profile_path: str) -> JobGraph: - job_graph = JobGraph( - name=query_name, + # Check that none of the arg values are None + assert all([val is not None for val in release_policy_args.values()]) - # TODO: make configurable - release_policy=JobGraph.ReleasePolicy.fixed( - period=EventTime(30, EventTime.Unit.US), - num_invocations=10, - start=EventTime(0, EventTime.Unit.US), + # Construct the release policy + start_time = EventTime( + time=self._rng.randint( + self._flags.randomize_start_time_min, + self._flags.randomize_start_time_max, ), + unit=EventTime.Unit.US, + ) + release_policy = getattr( + JobGraph.ReleasePolicy, self._flags.override_release_policy + )(start=start_time, rng_seed=self._rng_seed, **release_policy_args) + + return release_policy + @staticmethod + def make_job_graph( + query_name: str, graph: List[Dict[str, Any]], profile_path: str + ) -> JobGraph: + job_graph = JobGraph( + name=query_name, # TODO: make configurable - deadline_variance=(0,0), + deadline_variance=(10, 50), ) query_num = int(query_name[1:]) @@ -100,9 +170,10 @@ def make_job_graph(query_name: str, graph: List[Dict[str, Any]], profile_path: s return job_graph - @staticmethod - def load_query_profile(profiler_data: Dict[int, Dict[str, Any]], query_name: str, node_name: str) -> WorkProfile: + def load_query_profile( + profiler_data: Dict[int, Dict[str, Any]], query_name: str, node_name: str + ) -> WorkProfile: profile = profiler_data[int(node_name)] resources = Resources( resource_vector={ @@ -122,9 +193,10 @@ def load_query_profile(profiler_data: Dict[int, Dict[str, Any]], query_name: str execution_strategies=execution_strategies, ) - @staticmethod - def get_profiler_data_for_query(profile_path: str, query_num: int) -> Dict[int, Dict[str, Any]]: + def get_profiler_data_for_query( + profile_path: str, query_num: int + ) -> Dict[int, Dict[str, Any]]: def pre_process_task_duration(task_duration): # 
remove fresh durations from first wave clean_first_wave = {} @@ -152,7 +224,6 @@ def pre_process_task_duration(task_duration): for n in range(num_nodes): task_duration = task_durations[n] e = next(iter(task_duration["first_wave"])) - # NOTE: somehow only picks the first element {2: [n_tasks_in_ms]} num_tasks = len(task_duration["first_wave"][e]) + len( task_duration["rest_wave"][e] @@ -176,12 +247,21 @@ def pre_process_task_duration(task_duration): return stage_info - def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: - try: - return next(self._workloads) - except StopIteration: + if len(self._release_times) == 0: return None + to_release, self._release_times = before_and_after(lambda t: t <= current_time + self._workload_update_interval, self._release_times) + for t in to_release: + query_num = self._rng.randint(1, len(self._job_graphs)) + query_name = f"Q{query_num}" + job_graph = self._job_graphs[query_name] + task_graph = job_graph.get_next_task_graph( + start_time=t, + _flags=self._flags, + ) + self._workload.add_task_graph(task_graph) + self._release_times = list(self._release_times) + return self._workload class SetWithCount(object): @@ -208,4 +288,3 @@ def remove(self, item): self.set[item] -= 1 if self.set[item] == 0: del self.set[item] - diff --git a/main.py b/main.py index 3a58f6e8..281a54cf 100644 --- a/main.py +++ b/main.py @@ -137,6 +137,17 @@ "./profiles/workload/tpch/queries.yaml", "Path to a YAML file specifying the TPC-H query DAGs", ) +flags.DEFINE_integer( + "tpch_num_queries", + 50, + "Number of TPC-H queries to run", +) +flags.DEFINE_enum( + "tpch_dataset_size", + "50", + ["2", "50", "100", "250", "500"], + "Size of the TPC-H dataset to use", +) # AlibabaLoader related flags. 
flags.DEFINE_integer( @@ -644,6 +655,7 @@ def main(args): elif FLAGS.replay_trace == "tpch": workload_loader = TpchLoader( path=FLAGS.tpch_query_dag_spec, + flags=FLAGS, ) else: raise NotImplementedError( diff --git a/requirements.txt b/requirements.txt index f3e8957c..cfb3347d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ cplex pre-commit black isort +more-itertools From be667046e80d8227452e162053c2cc0d8fe1fe2a Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 23 Sep 2024 08:35:59 -0400 Subject: [PATCH 006/128] Wrap up tpch loader implementation --- configs/{tpch_test.conf => tpch_replay.conf} | 9 ++- data/tpch_loader.py | 68 ++++++++++++-------- profiles/workers/tpch_cluster.yaml | 6 ++ requirements.txt | 1 - simulator.py | 26 ++++---- 5 files changed, 67 insertions(+), 43 deletions(-) rename configs/{tpch_test.conf => tpch_replay.conf} (60%) create mode 100644 profiles/workers/tpch_cluster.yaml diff --git a/configs/tpch_test.conf b/configs/tpch_replay.conf similarity index 60% rename from configs/tpch_test.conf rename to configs/tpch_replay.conf index a7fe9073..3108e603 100644 --- a/configs/tpch_test.conf +++ b/configs/tpch_replay.conf @@ -10,14 +10,19 @@ --scheduler=EDF --scheduler_runtime=0 --enforce_deadlines +--min_deadline_variance=25 +--max_deadline_variance=50 # Execution mode configs. --execution_mode=replay --replay_trace=tpch # Release time config. 
---override_arrival_period=5 ---override_num_invocation=10 +--override_release_policy=gamma +--override_gamma_coefficient=1 +--override_poisson_arrival_rate=0.04 +--override_num_invocation=50 # TPCH flags --tpch_query_dag_spec=profiles/workload/tpch/queries.yaml +--worker_profile_path=profiles/workers/tpch_cluster.yaml diff --git a/data/tpch_loader.py b/data/tpch_loader.py index bd2fedf7..feaf09a6 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -1,5 +1,5 @@ +import sys import random -from pathlib import Path from typing import Any, Dict, List, Optional from pathlib import Path @@ -8,8 +8,6 @@ import numpy as np import yaml -from more_itertools import before_and_after - from utils import EventTime from workload import ( Workload, @@ -25,14 +23,6 @@ from .base_workload_loader import BaseWorkloadLoader -""" -- [ ] Release policy based on workload -- [ ] Fix current time setting -- [ ] Configure deadline variance -- [ ] Configure release policy -""" - - class TpchLoader(BaseWorkloadLoader): """Loads the TPCH trace from the provided file @@ -45,24 +35,28 @@ def __init__(self, path: str, flags: "absl.flags") -> None: self._flags = flags self._rng_seed = flags.random_seed self._rng = random.Random(self._rng_seed) - self._loop_timeout = flags.loop_timeout - self._num_queries = flags.tpch_num_queries - self._dataset_size = flags.tpch_dataset_size - if flags.workload_profile_path: - self._workload_profile_path = str( - Path(flags.workload_profile_path) / f"{self._dataset_size}g" - ) + if flags.workload_update_interval > 0: + self._workload_update_interval = flags.workload_update_interval else: - self._workload_profile_path = "./profiles/workload/tpch/decima/2g" - self._workload_update_interval = EventTime(10, EventTime.Unit.US) + self._workload_update_interval = EventTime(sys.maxsize, EventTime.Unit.US) release_policy = self._get_release_policy() self._release_times = release_policy.get_release_times( completion_time=EventTime(self._flags.loop_timeout, 
EventTime.Unit.US) ) + self._current_release_pointer = 0 + + # Set up query name to job graph mapping with open(path, "r") as f: workload_data = yaml.safe_load(f) + if flags.workload_profile_path: + workload_profile_path = str( + Path(flags.workload_profile_path) / f"{flags.s.tpch_dataset_size}g" + ) + else: + workload_profile_path = "./profiles/workload/tpch/decima/2g" + job_graphs = {} for query in workload_data["graphs"]: query_name = query["name"] @@ -70,11 +64,17 @@ def __init__(self, path: str, flags: "absl.flags") -> None: job_graph = TpchLoader.make_job_graph( query_name=query_name, graph=graph, - profile_path=self._workload_profile_path, + profile_path=workload_profile_path, + deadline_variance=( + int(flags.min_deadline_variance), + int(flags.max_deadline_variance), + ) ) job_graphs[query_name] = job_graph self._job_graphs = job_graphs + + # Initialize workload self._workload = Workload.empty(flags) def _get_release_policy(self): @@ -134,12 +134,11 @@ def _get_release_policy(self): @staticmethod def make_job_graph( - query_name: str, graph: List[Dict[str, Any]], profile_path: str + query_name: str, graph: List[Dict[str, Any]], profile_path: str, deadline_variance=(0,0), ) -> JobGraph: job_graph = JobGraph( name=query_name, - # TODO: make configurable - deadline_variance=(10, 50), + deadline_variance=deadline_variance, ) query_num = int(query_name[1:]) @@ -248,9 +247,24 @@ def pre_process_task_duration(task_duration): return stage_info def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: - if len(self._release_times) == 0: + to_release = [] + while ( + self._current_release_pointer < len(self._release_times) + and self._release_times[self._current_release_pointer] + <= current_time + self._workload_update_interval + ): + to_release.append( + self._release_times[self._current_release_pointer] + ) + self._current_release_pointer += 1 + + if ( + self._current_release_pointer >= len(self._release_times) + and len(to_release) == 0 + ): + 
# Nothing left to release return None - to_release, self._release_times = before_and_after(lambda t: t <= current_time + self._workload_update_interval, self._release_times) + for t in to_release: query_num = self._rng.randint(1, len(self._job_graphs)) query_name = f"Q{query_num}" @@ -260,7 +274,7 @@ def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: _flags=self._flags, ) self._workload.add_task_graph(task_graph) - self._release_times = list(self._release_times) + return self._workload diff --git a/profiles/workers/tpch_cluster.yaml b/profiles/workers/tpch_cluster.yaml new file mode 100644 index 00000000..592ba42d --- /dev/null +++ b/profiles/workers/tpch_cluster.yaml @@ -0,0 +1,6 @@ +- name: WorkerPool_1 + workers: + - name: Worker_1_1 + resources: + - name: Slot + quantity: 500 diff --git a/requirements.txt b/requirements.txt index cfb3347d..f3e8957c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,3 @@ cplex pre-commit black isort -more-itertools diff --git a/simulator.py b/simulator.py index 1ccf06fd..48ffc607 100644 --- a/simulator.py +++ b/simulator.py @@ -1539,19 +1539,19 @@ def __handle_update_workload(self, event: Event) -> None: len(releasable_tasks), ) - # Add the TaskGraphRelease events into the system. - for task_graph_name, task_graph in self._workload.task_graphs.items(): - event = Event( - event_type=EventType.TASK_GRAPH_RELEASE, - time=task_graph.release_time, - task_graph=task_graph_name, - ) - self._event_queue.add_event(event) - self._logger.info( - "[%s] Added %s to the event queue.", - self._simulator_time.to(EventTime.Unit.US).time, - event, - ) + # # Add the TaskGraphRelease events into the system. 
+ # for task_graph_name, task_graph in self._workload.task_graphs.items(): + # event = Event( + # event_type=EventType.TASK_GRAPH_RELEASE, + # time=task_graph.release_time, + # task_graph=task_graph_name, + # ) + # self._event_queue.add_event(event) + # self._logger.info( + # "[%s] Added %s to the event queue.", + # self._simulator_time.to(EventTime.Unit.US).time, + # event, + # ) max_release_time = self._simulator_time for task in releasable_tasks: From a4d0dedd6a65477f38a5c527c1855832eaadb5ea Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 1 Oct 2024 12:18:45 -0400 Subject: [PATCH 007/128] scale runtime based on max number of tasks --- configs/tpch_replay.conf | 24 +++++++--- data/tpch_loader.py | 70 +++++++++++++++++++++++------- main.py | 10 +++++ profiles/workers/tpch_cluster.yaml | 2 +- 4 files changed, 85 insertions(+), 21 deletions(-) diff --git a/configs/tpch_replay.conf b/configs/tpch_replay.conf index 3108e603..cb0dc26f 100644 --- a/configs/tpch_replay.conf +++ b/configs/tpch_replay.conf @@ -1,15 +1,28 @@ # Output configs. ---log=./tpch_test.log +--log=./tpch_replay.log --log_level=debug ---csv=./tpch_test.csv +--csv=./tpch_replay.csv # Task configs. --runtime_variance=0 # Scheduler configs. ---scheduler=EDF + +# EDF +# --scheduler=EDF +# --scheduler_runtime=0 +# --enforce_deadlines + +# DSched +--scheduler=TetriSched_Gurobi --scheduler_runtime=0 --enforce_deadlines +--retract_schedules +--release_taskgraphs +--drop_skipped_tasks +--scheduler_time_discretization=1 + +# Deadline variance --min_deadline_variance=25 --max_deadline_variance=50 @@ -20,9 +33,10 @@ # Release time config. 
--override_release_policy=gamma --override_gamma_coefficient=1 ---override_poisson_arrival_rate=0.04 ---override_num_invocation=50 +--override_poisson_arrival_rate=0.01 +--override_num_invocation=1 # TPCH flags +--random_seed=1234 --tpch_query_dag_spec=profiles/workload/tpch/queries.yaml --worker_profile_path=profiles/workers/tpch_cluster.yaml diff --git a/data/tpch_loader.py b/data/tpch_loader.py index feaf09a6..7139af6d 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -1,3 +1,4 @@ +import math import sys import random @@ -8,7 +9,7 @@ import numpy as np import yaml -from utils import EventTime +from utils import EventTime, setup_logging from workload import ( Workload, WorkProfile, @@ -33,6 +34,12 @@ class TpchLoader(BaseWorkloadLoader): def __init__(self, path: str, flags: "absl.flags") -> None: self._flags = flags + self._logger = setup_logging( + name=self.__class__.__name__, + log_dir=flags.log_dir, + log_file=flags.log_file_name, + log_level=flags.log_level, + ) self._rng_seed = flags.random_seed self._rng = random.Random(self._rng_seed) if flags.workload_update_interval > 0: @@ -43,6 +50,7 @@ def __init__(self, path: str, flags: "absl.flags") -> None: self._release_times = release_policy.get_release_times( completion_time=EventTime(self._flags.loop_timeout, EventTime.Unit.US) ) + self._current_release_pointer = 0 # Set up query name to job graph mapping @@ -61,14 +69,14 @@ def __init__(self, path: str, flags: "absl.flags") -> None: for query in workload_data["graphs"]: query_name = query["name"] graph = query["graph"] - job_graph = TpchLoader.make_job_graph( + job_graph = self.make_job_graph( query_name=query_name, graph=graph, profile_path=workload_profile_path, deadline_variance=( int(flags.min_deadline_variance), int(flags.max_deadline_variance), - ) + ), ) job_graphs[query_name] = job_graph @@ -132,9 +140,12 @@ def _get_release_policy(self): return release_policy - @staticmethod def make_job_graph( - query_name: str, graph: List[Dict[str, 
Any]], profile_path: str, deadline_variance=(0,0), + self, + query_name: str, + graph: List[Dict[str, Any]], + profile_path: str, + deadline_variance=(0, 0), ) -> JobGraph: job_graph = JobGraph( name=query_name, @@ -146,7 +157,7 @@ def make_job_graph( name_to_job = {} for node in graph: - worker_profile = TpchLoader.load_query_profile( + worker_profile = self.make_work_profile( profiler_data=profiler_data, query_name=query_name, node_name=node["name"], @@ -169,14 +180,45 @@ def make_job_graph( return job_graph - @staticmethod - def load_query_profile( - profiler_data: Dict[int, Dict[str, Any]], query_name: str, node_name: str + def make_work_profile( + self, profiler_data: Dict[int, Dict[str, Any]], query_name: str, node_name: str ) -> WorkProfile: profile = profiler_data[int(node_name)] + + num_tasks = min(self._flags.tpch_max_executors_per_job, profile["num_tasks"]) + + # adjust runtime based on num_tasks + runtime = max( + self._flags.tpch_min_task_runtime, + ( + profile["avg_task_duration"] + if profile["num_tasks"] <= self._flags.tpch_max_executors_per_job + else math.ceil( + (profile["num_tasks"] * profile["avg_task_duration"]) + / self._flags.tpch_max_executors_per_job + ) + ), + ) + + if profile["num_tasks"] > self._flags.tpch_max_executors_per_job: + self._logger.debug( + "%s@%s: Profiled slots > tpch_max_executors_per_job: %s. 
Converted " + "(slots,runtime) from (%s,%sms) to (%s, %sms)", + node_name, + query_name, + self._flags.tpch_max_executors_per_job, + profile["num_tasks"], + profile["avg_task_duration"], + num_tasks, + runtime, + ) + + # convert runtime to us, it is in millseconds + runtime = round(max(1, runtime / 1e3)) + resources = Resources( resource_vector={ - Resource(name="Slot", _id="any"): profile["num_tasks"], + Resource(name="Slot", _id="any"): num_tasks, }, ) execution_strategies = ExecutionStrategies() @@ -184,7 +226,7 @@ def load_query_profile( strategy=ExecutionStrategy( resources=resources, batch_size=1, - runtime=EventTime(profile["avg_task_duration"], EventTime.Unit.US), + runtime=EventTime(runtime, EventTime.Unit.US), ), ) return WorkProfile( @@ -240,7 +282,7 @@ def pre_process_task_duration(task_duration): curr_stage = { "stage_id": n, "num_tasks": num_tasks, - "avg_task_duration": round(rough_duration), + "avg_task_duration": round(rough_duration), # in milliseconds } stage_info[n] = curr_stage @@ -253,9 +295,7 @@ def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: and self._release_times[self._current_release_pointer] <= current_time + self._workload_update_interval ): - to_release.append( - self._release_times[self._current_release_pointer] - ) + to_release.append(self._release_times[self._current_release_pointer]) self._current_release_pointer += 1 if ( diff --git a/main.py b/main.py index 281a54cf..ade63c80 100644 --- a/main.py +++ b/main.py @@ -148,6 +148,16 @@ ["2", "50", "100", "250", "500"], "Size of the TPC-H dataset to use", ) +flags.DEFINE_integer( + "tpch_max_executors_per_job", + 50, + "Maximum number of executors to use per TPC-H query stage", +) +flags.DEFINE_integer( + "tpch_min_task_runtime", + 8, + "Minimum runtime of a TPC-H task", +) # AlibabaLoader related flags. 
flags.DEFINE_integer( diff --git a/profiles/workers/tpch_cluster.yaml b/profiles/workers/tpch_cluster.yaml index 592ba42d..582302b2 100644 --- a/profiles/workers/tpch_cluster.yaml +++ b/profiles/workers/tpch_cluster.yaml @@ -3,4 +3,4 @@ - name: Worker_1_1 resources: - name: Slot - quantity: 500 + quantity: 640 From 8111ba66d422b4193f80427c29c3e9588b6bacfe Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 2 Oct 2024 13:21:19 -0400 Subject: [PATCH 008/128] fix bug in runtime calc --- data/tpch_loader.py | 19 ++++++++----------- main.py | 8 ++++++++ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index 7139af6d..be24d12e 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -188,16 +188,13 @@ def make_work_profile( num_tasks = min(self._flags.tpch_max_executors_per_job, profile["num_tasks"]) # adjust runtime based on num_tasks - runtime = max( - self._flags.tpch_min_task_runtime, - ( - profile["avg_task_duration"] - if profile["num_tasks"] <= self._flags.tpch_max_executors_per_job - else math.ceil( - (profile["num_tasks"] * profile["avg_task_duration"]) - / self._flags.tpch_max_executors_per_job - ) - ), + runtime = ( + profile["avg_task_duration"] + if profile["num_tasks"] <= self._flags.tpch_max_executors_per_job + else math.ceil( + (profile["num_tasks"] * profile["avg_task_duration"]) + / self._flags.tpch_max_executors_per_job + ) ) if profile["num_tasks"] > self._flags.tpch_max_executors_per_job: @@ -214,7 +211,7 @@ def make_work_profile( ) # convert runtime to us, it is in millseconds - runtime = round(max(1, runtime / 1e3)) + runtime = round(max(self._flags.tpch_min_task_runtime, runtime / 1e3)) resources = Resources( resource_vector={ diff --git a/main.py b/main.py index ade63c80..b4111d33 100644 --- a/main.py +++ b/main.py @@ -159,6 +159,14 @@ "Minimum runtime of a TPC-H task", ) +flags.DEFINE_list( + "opt_passes", + [], + "A list of timestamps (in µs) at which to request extra 
logging from the Scheduler." + "If scheduler_log_to_file is `True`, then extra information will be requested for " + "all timestamps.", +) + # AlibabaLoader related flags. flags.DEFINE_integer( "alibaba_loader_task_cpu_multiplier", From e172b56ae1fa55b4273a315285cc6c29231d903c Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 09:48:20 -0500 Subject: [PATCH 009/128] rename optimization_passes flag to opt_passes --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b2df225a..2d2f3ac4 100644 --- a/main.py +++ b/main.py @@ -473,7 +473,7 @@ "placing the TaskGraph, and drop the TaskGraph if it cannot be placed after.", ) flags.DEFINE_multi_enum( - "optimization_passes", + "opt_passes", [], [ "CRITICAL_PATH_PASS", From 90e696cd0fd9ac69b0772b3cb9eb6959f34d346e Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 10:10:40 -0500 Subject: [PATCH 010/128] add cloudlab support, fix runtime rounding bug, make rng gen match service --- data/tpch_loader.py | 436 ++++++++++++++++++++++++++++---------------- main.py | 6 + 2 files changed, 287 insertions(+), 155 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index be24d12e..94962d61 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -1,8 +1,10 @@ +import os import math +import json import sys import random -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Callable from pathlib import Path import absl @@ -13,6 +15,7 @@ from workload import ( Workload, WorkProfile, + TaskGraph, Job, JobGraph, ExecutionStrategy, @@ -46,46 +49,40 @@ def __init__(self, path: str, flags: "absl.flags") -> None: self._workload_update_interval = flags.workload_update_interval else: self._workload_update_interval = EventTime(sys.maxsize, EventTime.Unit.US) - release_policy = self._get_release_policy() - self._release_times = release_policy.get_release_times( - 
completion_time=EventTime(self._flags.loop_timeout, EventTime.Unit.US) - ) - - self._current_release_pointer = 0 - - # Set up query name to job graph mapping + # Set up task graph generators with open(path, "r") as f: workload_data = yaml.safe_load(f) - - if flags.workload_profile_path: - workload_profile_path = str( - Path(flags.workload_profile_path) / f"{flags.s.tpch_dataset_size}g" - ) - else: - workload_profile_path = "./profiles/workload/tpch/decima/2g" - - job_graphs = {} + task_graph_generators = {} for query in workload_data["graphs"]: query_name = query["name"] graph = query["graph"] - job_graph = self.make_job_graph( + gen = self.make_task_graph_generator( query_name=query_name, graph=graph, - profile_path=workload_profile_path, - deadline_variance=( - int(flags.min_deadline_variance), - int(flags.max_deadline_variance), - ), ) - job_graphs[query_name] = job_graph + task_graph_generators[query_name] = gen + self._task_graph_generators = task_graph_generators + + # Gather release times + release_policy = self._make_release_policy() + release_times = release_policy.get_release_times( + completion_time=EventTime(self._flags.loop_timeout, EventTime.Unit.US) + ) + + # Sample queries to be released + query_nums = [ + self._rng.randint(1, len(self._task_graph_generators)) + for _ in range(self._flags.override_num_invocation) + ] - self._job_graphs = job_graphs + self._query_nums_and_release_times = list(zip(query_nums, release_times)) + self._current_release_pointer = 0 # Initialize workload self._workload = Workload.empty(flags) - def _get_release_policy(self): + def _make_release_policy(self): release_policy_args = {} if self._flags.override_release_policy == "periodic": release_policy_args = { @@ -123,99 +120,147 @@ def _get_release_policy(self): f"Release policy {self._flags.override_release_policy} not implemented." 
) - # Check that none of the arg values are None - assert all([val is not None for val in release_policy_args.values()]) - - # Construct the release policy - start_time = EventTime( - time=self._rng.randint( + return make_release_policy( + self._flags.override_release_policy, + release_policy_args, + self._rng, + self._rng_seed, + ( self._flags.randomize_start_time_min, self._flags.randomize_start_time_max, ), - unit=EventTime.Unit.US, ) - release_policy = getattr( - JobGraph.ReleasePolicy, self._flags.override_release_policy - )(start=start_time, rng_seed=self._rng_seed, **release_policy_args) - return release_policy - - def make_job_graph( + def make_task_graph_generator( self, query_name: str, graph: List[Dict[str, Any]], - profile_path: str, - deadline_variance=(0, 0), - ) -> JobGraph: - job_graph = JobGraph( - name=query_name, - deadline_variance=deadline_variance, - ) - - query_num = int(query_name[1:]) - profiler_data = TpchLoader.get_profiler_data_for_query(profile_path, query_num) + ) -> Callable[[int, EventTime, EventTime], TaskGraph]: + def h(idx: int, current_time: EventTime, start_time: EventTime): + # Construct a JobGraph + job_graph = JobGraph(name=f"{query_name}[{idx}]") + query_num = int(query_name[1:]) + profiler_data = get_all_stage_info_for_query( + query_num, + self._flags.tpch_profile_type, + self._flags.tpch_dataset_size, + self._flags.tpch_max_executors_per_job, + ) + name_to_job = {} + for node in graph: + worker_profile = self.make_work_profile( + profiler_data=profiler_data, + query_name=query_name, + node_name=node["name"], + ) + job = Job( + name=node["name"], + profile=worker_profile, + ) + name_to_job[node["name"]] = job + job_graph.add_job(job=job) + for node in graph: + job = name_to_job[node["name"]] + if "children" in node: + for child in node["children"]: + if child not in name_to_job: + raise ValueError( + f"Child {child} of {node['name']} was " + f"not present in the graph." 
+ ) + child_job = name_to_job[child] + job_graph.add_child(job, child_job) + + # Construct TaskGraph from JobGraph + task_graph = job_graph.get_next_task_graph( + start_time=start_time, + _flags=self._flags, + ) - name_to_job = {} - for node in graph: - worker_profile = self.make_work_profile( - profiler_data=profiler_data, - query_name=query_name, - node_name=node["name"], + # Update deadline + critical_path = task_graph.get_longest_path( + weights=lambda task: (task.slowest_execution_strategy.runtime.time) + ) + critical_path_time = ( + sum( + [t.slowest_execution_strategy.runtime for t in critical_path], + start=EventTime.zero(), + ) + .to(EventTime.Unit.US) + .time + ) + deadline_variance_factor = ( + 1.0 + + ( + self._rng.randint( + self._flags.min_deadline_variance, + self._flags.max_deadline_variance, + ) + ) + / 100 + ) + task_graph_slo_time = math.ceil( + critical_path_time * deadline_variance_factor ) - job = Job(name=node["name"], profile=worker_profile) - name_to_job[node["name"]] = job - job_graph.add_job(job=job) - - for node in graph: - job = name_to_job[node["name"]] - if "children" in node: - for child in node["children"]: - if child not in name_to_job: - raise ValueError( - f"Child {child} of {node['name']} was " - f"not present in the graph." 
- ) - child_job = name_to_job[child] - job_graph.add_child(job, child_job) - - return job_graph + for task in task_graph.get_nodes(): + deadline = EventTime( + start_time.time + task_graph_slo_time, unit=EventTime.Unit.US + ) + task.update_deadline(deadline) + + return task_graph + + return h def make_work_profile( self, profiler_data: Dict[int, Dict[str, Any]], query_name: str, node_name: str ) -> WorkProfile: profile = profiler_data[int(node_name)] - num_tasks = min(self._flags.tpch_max_executors_per_job, profile["num_tasks"]) + profiled_task_slots = profile["num_tasks"] + profiled_runtime = math.ceil(profile["avg_task_duration_ms"] / 1e3) - # adjust runtime based on num_tasks - runtime = ( - profile["avg_task_duration"] - if profile["num_tasks"] <= self._flags.tpch_max_executors_per_job - else math.ceil( - (profile["num_tasks"] * profile["avg_task_duration"]) + if profiled_task_slots > self._flags.tpch_max_executors_per_job: + num_slots = self._flags.tpch_max_executors_per_job + runtime = math.ceil( + (profiled_task_slots * profiled_runtime) / self._flags.tpch_max_executors_per_job ) - ) - - if profile["num_tasks"] > self._flags.tpch_max_executors_per_job: self._logger.debug( - "%s@%s: Profiled slots > tpch_max_executors_per_job: %s. Converted " - "(slots,runtime) from (%s,%sms) to (%s, %sms)", + "%s@%s: num_slots (%s) > tpch_max_executors_per_job (%s). 
Converted " + "(slots,runtime) from (%s,%s) to (%s, %s)", node_name, query_name, + profiled_task_slots, self._flags.tpch_max_executors_per_job, - profile["num_tasks"], - profile["avg_task_duration"], - num_tasks, + profiled_task_slots, + profiled_runtime, + num_slots, runtime, ) + else: + num_slots = profiled_task_slots + runtime = profiled_runtime - # convert runtime to us, it is in millseconds - runtime = round(max(self._flags.tpch_min_task_runtime, runtime / 1e3)) + if runtime < self._flags.tpch_min_task_runtime: + _runtime = runtime + runtime = max(self._flags.tpch_min_task_runtime, _runtime) + self._logger.debug( + "%s@%s: runtime (%s) < tpch_min_task_runtime (%s). Converted " + "(slots,runtime) from (%s,%s) to (%s, %s)", + node_name, + query_name, + _runtime, + self._flags.tpch_min_task_runtime, + num_slots, + _runtime, + num_slots, + runtime, + ) resources = Resources( resource_vector={ - Resource(name="Slot", _id="any"): num_tasks, + Resource(name="Slot", _id="any"): num_slots, }, ) execution_strategies = ExecutionStrategies() @@ -231,90 +276,70 @@ def make_work_profile( execution_strategies=execution_strategies, ) - @staticmethod - def get_profiler_data_for_query( - profile_path: str, query_num: int - ) -> Dict[int, Dict[str, Any]]: - def pre_process_task_duration(task_duration): - # remove fresh durations from first wave - clean_first_wave = {} - for e in task_duration["first_wave"]: - clean_first_wave[e] = [] - fresh_durations = SetWithCount() - for d in task_duration["fresh_durations"][e]: - fresh_durations.add(d) - for d in task_duration["first_wave"][e]: - if d not in fresh_durations: - clean_first_wave[e].append(d) - else: - # prevent duplicated fresh duration blocking first wave - fresh_durations.remove(d) - - task_durations = np.load( - Path(profile_path) / f"task_duration_{query_num}.npy", - allow_pickle=True, - ).item() - - num_nodes = len(task_durations) - - stage_info = {} - - for n in range(num_nodes): - task_duration = task_durations[n] - e = 
next(iter(task_duration["first_wave"])) - - num_tasks = len(task_duration["first_wave"][e]) + len( - task_duration["rest_wave"][e] - ) - - # remove fresh duration from first wave duration - # drag nearest neighbor first wave duration to empty spots - pre_process_task_duration(task_duration) - rough_duration = np.mean( - [i for t in task_duration["first_wave"].values() for i in t] - + [i for t in task_duration["rest_wave"].values() for i in t] - + [i for t in task_duration["fresh_durations"].values() for i in t] - ) - - curr_stage = { - "stage_id": n, - "num_tasks": num_tasks, - "avg_task_duration": round(rough_duration), # in milliseconds - } - stage_info[n] = curr_stage - - return stage_info - def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: + # Reset rng if this is the first workload. This is to ensure we have + # parity with how jobs are spawned in Spark + if self._current_release_pointer == 0: + self._rng = random.Random(self._rng_seed) + to_release = [] while ( - self._current_release_pointer < len(self._release_times) - and self._release_times[self._current_release_pointer] + self._current_release_pointer < len(self._query_nums_and_release_times) + and self._query_nums_and_release_times[self._current_release_pointer][1] <= current_time + self._workload_update_interval ): - to_release.append(self._release_times[self._current_release_pointer]) + to_release.append( + self._query_nums_and_release_times[self._current_release_pointer] + ) self._current_release_pointer += 1 if ( - self._current_release_pointer >= len(self._release_times) + self._current_release_pointer >= len(self._query_nums_and_release_times) and len(to_release) == 0 ): # Nothing left to release return None - for t in to_release: - query_num = self._rng.randint(1, len(self._job_graphs)) - query_name = f"Q{query_num}" - job_graph = self._job_graphs[query_name] - task_graph = job_graph.get_next_task_graph( + for i, (q, t) in enumerate(to_release): + query_name = f"Q{q}" + 
task_graph = self._task_graph_generators[query_name]( + idx=i, + current_time=current_time, start_time=t, - _flags=self._flags, ) self._workload.add_task_graph(task_graph) return self._workload +def make_release_policy( + release_policy, release_policy_args, rng, seed, randomize_start_time=(0, 0) +): + # Check that none of the arg values are None + assert all([val is not None for val in release_policy_args.values()]) + + # Construct the release policy + start_time = EventTime( + time=rng.randint(*randomize_start_time), + unit=EventTime.Unit.US, + ) + release_policy = getattr(JobGraph.ReleasePolicy, release_policy)( + start=start_time, rng_seed=seed, **release_policy_args + ) + + return release_policy + + +# TODO: make configurable +TPCH_SUBDIR = "100g/" +DECIMA_TPCH_DIR = ( + "/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/decima/" +) +CLOUDLAB_TPCH_DIR = ( + "/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/cloudlab/" +) + + class SetWithCount(object): """ allow duplication in set @@ -339,3 +364,104 @@ def remove(self, item): self.set[item] -= 1 if self.set[item] == 0: del self.set[item] + + +def pre_process_task_duration(task_duration): + # remove fresh durations from first wave + clean_first_wave = {} + for e in task_duration["first_wave"]: + clean_first_wave[e] = [] + fresh_durations = SetWithCount() + # O(1) access + for d in task_duration["fresh_durations"][e]: + fresh_durations.add(d) + for d in task_duration["first_wave"][e]: + if d not in fresh_durations: + clean_first_wave[e].append(d) + else: + # prevent duplicated fresh duration blocking first wave + fresh_durations.remove(d) + + +def get_all_stage_info_for_query(query_num, profile_type, dataset_size, max_executors): + stage_info = {} + if profile_type == "Decima": + stage_info = use_decima_tpch_profile(query_num, dataset_size) + elif profile_type == "Cloudlab": + stage_info = use_cloudlab_profile(query_num, dataset_size, max_executors) + else: + raise 
ValueError(f"Invalid profile type: {profile_type}") + + return stage_info + + +def use_cloudlab_profile(query_num, dataset_size, max_executors): + cloudlab_profile_json = os.path.join( + CLOUDLAB_TPCH_DIR, "cloudlab_22query_tpch_profiles.json" + ) + with open(cloudlab_profile_json, "r") as file: + data = json.load(file) + + query_key_to_extract = ( + "tpch_q" + + str(query_num) + + "_" + + str(dataset_size) + + "g" + + "_maxCores_" + + str(max_executors) + ) + required_query_profile = data[query_key_to_extract] + + stage_info = {} + + for i, stage_profile in enumerate(required_query_profile): + curr_stage = { + "stage_id": i, + "num_tasks": stage_profile["num_tasks"], + "avg_task_duration_ms": round(stage_profile["average_runtime_ms"]), + } + stage_info[i] = curr_stage + + return stage_info + + +def use_decima_tpch_profile(query_num, dataset_size): + task_durations = np.load( + os.path.join( + DECIMA_TPCH_DIR, dataset_size, "task_duration_" + str(query_num) + ".npy" + ), + allow_pickle=True, + ).item() + + num_nodes = len(task_durations) + + stage_info = {} + + for n in range(num_nodes): + task_duration = task_durations[n] + e = next(iter(task_duration["first_wave"])) + # NOTE: somehow only picks the first element {2: [n_tasks_in_ms]} + + num_tasks = len(task_duration["first_wave"][e]) + len( + task_duration["rest_wave"][e] + ) + + # remove fresh duration from first wave duration + # drag nearest neighbor first wave duration to empty spots + pre_process_task_duration(task_duration) + rough_duration = np.mean( + [i for t in task_duration["first_wave"].values() for i in t] + + [i for t in task_duration["rest_wave"].values() for i in t] + + [i for t in task_duration["fresh_durations"].values() for i in t] + ) + + # NOTE: Runtime per task is given in milliseconds + curr_stage = { + "stage_id": n, + "num_tasks": num_tasks, + "avg_task_duration_ms": round(rough_duration), + } + stage_info[n] = curr_stage + + return stage_info diff --git a/main.py b/main.py index 
b4111d33..b0bd0c6e 100644 --- a/main.py +++ b/main.py @@ -142,6 +142,12 @@ 50, "Number of TPC-H queries to run", ) +flags.DEFINE_enum( + "tpch_profile_type", + "Cloudlab", + ["Cloudlab", "Decima"], + "Type of TPC-H profile the data loader must use", +) flags.DEFINE_enum( "tpch_dataset_size", "50", From 06cf4f7dd98a164801a73dc8d1dadb6d6f9a522f Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 10:11:51 -0500 Subject: [PATCH 011/128] restore tpch_utils to main version --- rpc/tpch_utils.py | 86 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 6 deletions(-) diff --git a/rpc/tpch_utils.py b/rpc/tpch_utils.py index 9c1b2e55..ebc4e3cd 100644 --- a/rpc/tpch_utils.py +++ b/rpc/tpch_utils.py @@ -8,17 +8,91 @@ import networkx as nx import numpy as np -from data import TpchLoader - HOME_TPCH_DIR = "../profiles/workload/tpch_decima/" TPCH_SUBDIR = "2g/" +class SetWithCount(object): + """ + allow duplication in set + """ + + def __init__(self): + self.set = {} + + def __contains__(self, item): + return item in self.set + + def add(self, item): + if item in self.set: + self.set[item] += 1 + else: + self.set[item] = 1 + + def clear(self): + self.set.clear() + + def remove(self, item): + self.set[item] -= 1 + if self.set[item] == 0: + del self.set[item] + + +def pre_process_task_duration(task_duration): + # remove fresh durations from first wave + clean_first_wave = {} + for e in task_duration["first_wave"]: + clean_first_wave[e] = [] + fresh_durations = SetWithCount() + # O(1) access + for d in task_duration["fresh_durations"][e]: + fresh_durations.add(d) + for d in task_duration["first_wave"][e]: + if d not in fresh_durations: + clean_first_wave[e].append(d) + else: + # prevent duplicated fresh duration blocking first wave + fresh_durations.remove(d) + + def get_all_stage_info_for_query(query_num): - return TpchLoader.get_profiler_data_for_query( - profile_path=os.path.join(HOME_TPCH_DIR, TPCH_SUBDIR), - 
query_num=query_num, - ) + task_durations = np.load( + os.path.join( + HOME_TPCH_DIR, TPCH_SUBDIR, "task_duration_" + str(query_num) + ".npy" + ), + allow_pickle=True, + ).item() + + num_nodes = len(task_durations) + + stage_info = {} + + for n in range(num_nodes): + task_duration = task_durations[n] + e = next(iter(task_duration["first_wave"])) + # NOTE: somehow only picks the first element {2: [n_tasks_in_ms]} + + num_tasks = len(task_duration["first_wave"][e]) + len( + task_duration["rest_wave"][e] + ) + + # remove fresh duration from first wave duration + # drag nearest neighbor first wave duration to empty spots + pre_process_task_duration(task_duration) + rough_duration = np.mean( + [i for t in task_duration["first_wave"].values() for i in t] + + [i for t in task_duration["rest_wave"].values() for i in t] + + [i for t in task_duration["fresh_durations"].values() for i in t] + ) + + curr_stage = { + "stage_id": n, + "num_tasks": num_tasks, + "avg_task_duration": round(rough_duration), + } + stage_info[n] = curr_stage + + return stage_info def get_base_tpch_graph_structure(query_num): From fcb0180accb73aee72cd261eeb1c299c357e637f Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 10:14:21 -0500 Subject: [PATCH 012/128] split tpch_replay config files --- ...ch_replay.conf => tpch_replay_dsched.conf} | 20 ++++---- configs/tpch_replay_edf.conf | 47 +++++++++++++++++++ 2 files changed, 55 insertions(+), 12 deletions(-) rename configs/{tpch_replay.conf => tpch_replay_dsched.conf} (68%) create mode 100644 configs/tpch_replay_edf.conf diff --git a/configs/tpch_replay.conf b/configs/tpch_replay_dsched.conf similarity index 68% rename from configs/tpch_replay.conf rename to configs/tpch_replay_dsched.conf index cb0dc26f..1b839546 100644 --- a/configs/tpch_replay.conf +++ b/configs/tpch_replay_dsched.conf @@ -1,20 +1,15 @@ # Output configs. 
---log=./tpch_replay.log +--log=./tpch_replay_dsched.log --log_level=debug ---csv=./tpch_replay.csv +--csv=./tpch_replay_dsched.csv # Task configs. --runtime_variance=0 # Scheduler configs. -# EDF -# --scheduler=EDF -# --scheduler_runtime=0 -# --enforce_deadlines - # DSched ---scheduler=TetriSched_Gurobi +--scheduler=TetriSched --scheduler_runtime=0 --enforce_deadlines --retract_schedules @@ -23,8 +18,8 @@ --scheduler_time_discretization=1 # Deadline variance ---min_deadline_variance=25 ---max_deadline_variance=50 +--min_deadline_variance=10 +--max_deadline_variance=25 # Execution mode configs. --execution_mode=replay @@ -33,10 +28,11 @@ # Release time config. --override_release_policy=gamma --override_gamma_coefficient=1 ---override_poisson_arrival_rate=0.01 ---override_num_invocation=1 +--override_poisson_arrival_rate=1 +--override_num_invocation=10 # TPCH flags --random_seed=1234 --tpch_query_dag_spec=profiles/workload/tpch/queries.yaml +--tpch_dataset_size=50 --worker_profile_path=profiles/workers/tpch_cluster.yaml diff --git a/configs/tpch_replay_edf.conf b/configs/tpch_replay_edf.conf new file mode 100644 index 00000000..cf23650a --- /dev/null +++ b/configs/tpch_replay_edf.conf @@ -0,0 +1,47 @@ +# Output configs. +# --log=./tpch_replay_dsched.log +# --log_level=debug +# --csv=./tpch_replay_dsched.csv + +--log=./tpch_replay_edf.log +--log_level=debug +--csv=./tpch_replay_edf.csv + +# Task configs. +--runtime_variance=0 + +# Scheduler configs. + +# EDF +--scheduler=EDF +--scheduler_runtime=0 +--enforce_deadlines + +# DSched +# --scheduler=TetriSched +# --scheduler_runtime=0 +# --enforce_deadlines +# --retract_schedules +# --release_taskgraphs +# --drop_skipped_tasks +# --scheduler_time_discretization=1 + +# Deadline variance +--min_deadline_variance=10 +--max_deadline_variance=25 + +# Execution mode configs. +--execution_mode=replay +--replay_trace=tpch + +# Release time config. 
+--override_release_policy=gamma +--override_gamma_coefficient=1 +--override_poisson_arrival_rate=1 +--override_num_invocation=10 + +# TPCH flags +--random_seed=1234 +--tpch_query_dag_spec=profiles/workload/tpch/queries.yaml +--tpch_dataset_size=50 +--worker_profile_path=profiles/workers/tpch_cluster.yaml From 9014090f1d830816ec09cc83b626f50912489a68 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 10:15:24 -0500 Subject: [PATCH 013/128] remove opt_passes flag --- main.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/main.py b/main.py index b0bd0c6e..4fb27502 100644 --- a/main.py +++ b/main.py @@ -165,14 +165,6 @@ "Minimum runtime of a TPC-H task", ) -flags.DEFINE_list( - "opt_passes", - [], - "A list of timestamps (in µs) at which to request extra logging from the Scheduler." - "If scheduler_log_to_file is `True`, then extra information will be requested for " - "all timestamps.", -) - # AlibabaLoader related flags. flags.DEFINE_integer( "alibaba_loader_task_cpu_multiplier", From 86420f3d8fdfa20c55093a9fa335ce57a8b54d04 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 10:22:41 -0500 Subject: [PATCH 014/128] update tpch_utils.py --- rpc/tpch_utils.py | 115 +++++++++++----------------------------------- 1 file changed, 27 insertions(+), 88 deletions(-) diff --git a/rpc/tpch_utils.py b/rpc/tpch_utils.py index ebc4e3cd..48b28f83 100644 --- a/rpc/tpch_utils.py +++ b/rpc/tpch_utils.py @@ -2,106 +2,45 @@ import ast import json +import yaml import os from typing import Mapping, Sequence import networkx as nx import numpy as np -HOME_TPCH_DIR = "../profiles/workload/tpch_decima/" -TPCH_SUBDIR = "2g/" +from data.tpch_loader import get_all_stage_info_for_query -class SetWithCount(object): - """ - allow duplication in set - """ - - def __init__(self): - self.set = {} - - def __contains__(self, item): - return item in self.set - - def add(self, item): - if item in self.set: - self.set[item] += 1 - else: - 
self.set[item] = 1 - - def clear(self): - self.set.clear() - - def remove(self, item): - self.set[item] -= 1 - if self.set[item] == 0: - del self.set[item] - - -def pre_process_task_duration(task_duration): - # remove fresh durations from first wave - clean_first_wave = {} - for e in task_duration["first_wave"]: - clean_first_wave[e] = [] - fresh_durations = SetWithCount() - # O(1) access - for d in task_duration["fresh_durations"][e]: - fresh_durations.add(d) - for d in task_duration["first_wave"][e]: - if d not in fresh_durations: - clean_first_wave[e].append(d) - else: - # prevent duplicated fresh duration blocking first wave - fresh_durations.remove(d) - - -def get_all_stage_info_for_query(query_num): - task_durations = np.load( - os.path.join( - HOME_TPCH_DIR, TPCH_SUBDIR, "task_duration_" + str(query_num) + ".npy" - ), - allow_pickle=True, - ).item() - - num_nodes = len(task_durations) - - stage_info = {} - - for n in range(num_nodes): - task_duration = task_durations[n] - e = next(iter(task_duration["first_wave"])) - # NOTE: somehow only picks the first element {2: [n_tasks_in_ms]} - - num_tasks = len(task_duration["first_wave"][e]) + len( - task_duration["rest_wave"][e] - ) - - # remove fresh duration from first wave duration - # drag nearest neighbor first wave duration to empty spots - pre_process_task_duration(task_duration) - rough_duration = np.mean( - [i for t in task_duration["first_wave"].values() for i in t] - + [i for t in task_duration["rest_wave"].values() for i in t] - + [i for t in task_duration["fresh_durations"].values() for i in t] - ) - - curr_stage = { - "stage_id": n, - "num_tasks": num_tasks, - "avg_task_duration": round(rough_duration), - } - stage_info[n] = curr_stage - - return stage_info +TPCH_PARENT_DIR = "/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/" def get_base_tpch_graph_structure(query_num): - # use query_num to read string from file - with open(os.path.join(HOME_TPCH_DIR, "query_dag.json")) as f: - 
tpch_query_json = json.load(f) + with open(os.path.join(TPCH_PARENT_DIR, "queries.yaml")) as f: + tpch_query_yaml = yaml.load(f, Loader=yaml.FullLoader) + + # Extract the graph structure for the given query number + query_graph = None + for graph in tpch_query_yaml["graphs"]: + if graph["name"] == f"Q{query_num}": + query_graph = graph["graph"] + break + + if query_graph is None: + raise ValueError(f"Query number {query_num} not found in the YAML file") + + # Convert the graph structure to a format suitable for nx.DiGraph + query_dependency = [] + for node in query_graph: + if "children" in node: + for child in node["children"]: + query_dependency.append((node["name"], child)) + else: + # Ensure each tuple has two elements by adding a dummy node + query_dependency.append((node["name"], None)) - # get query dependency from file - query_dependency = ast.literal_eval(tpch_query_json["query_number"][str(query_num)]) + # Remove any tuples where the second element is None + query_dependency = [edge for edge in query_dependency if edge[1] is not None] # convert job structure into a nx graph base_tpch_graph = nx.DiGraph(query_dependency) From 2f09d5ec07e34d34f4aa97c549772129927a8dc3 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 13:23:23 -0500 Subject: [PATCH 015/128] setup new service.py --- rpc/service.py | 848 ++----------------------------------------- rpc/service_old.py | 874 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 899 insertions(+), 823 deletions(-) create mode 100644 rpc/service_old.py diff --git a/rpc/service.py b/rpc/service.py index 2aaa2dc9..2f048067 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -1,39 +1,13 @@ import asyncio -import heapq -import os -import sys -import time -from collections import defaultdict from concurrent import futures -from operator import attrgetter -from typing import Mapping, Sequence -from urllib.parse import urlparse -sys.path.append( - 
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) import erdos_scheduler_pb2 import erdos_scheduler_pb2_grpc + import grpc -from absl import app, flags -from tpch_utils import get_all_stage_info_for_query, verify_and_relable_tpch_app_graph -from schedulers import EDFScheduler, FIFOScheduler -from utils import EventTime, setup_logging -from workers import Worker, WorkerPool, WorkerPools -from workload import ( - ExecutionStrategies, - ExecutionStrategy, - Job, - Placement, - Resource, - Resources, - Task, - TaskGraph, - Workload, - WorkProfile, -) +from absl import app, flags FLAGS = flags.FLAGS @@ -41,831 +15,59 @@ flags.DEFINE_integer( "max_workers", 10, "Maximum number of workers to use for the RPC server." ) -flags.DEFINE_string("log_file", None, "Path to the log file.", short_name="log") -flags.DEFINE_string("log_level", "debug", "The level to log.") -flags.DEFINE_integer( - "initial_executors", - 10, - "The initial number of executors that are requested by each application.", -) -flags.DEFINE_integer( - "virtualized_cores", - 500, - "The number of virtualized cores that must be created in each Worker on the " - "framework. This allows us to spawn a higher number of executors than the number " - "possible with actual available resources. Thus, we can spawn the executors for " - "each application, and only selectively activate them according to the actual " - "available resources.", -) -flags.DEFINE_integer( - "virtualized_memory", - 500, - "The amount of virtualized memory (in GB) that must be created in each Worker on " - "the framework. Refer to the `virtualized_cores` flag for more information.", -) -flags.DEFINE_enum( - "scheduler", "EDF", ["FIFO", "EDF"], "The scheduler to use for this execution." 
-) -# Define an item containing completion timestamp and task -class TimedItem: - def __init__(self, timestamp, task): - self.timestamp = timestamp - self.task = task - - -# Define a priority queue based on heapq module -class PriorityQueue: - def __init__(self): - self._queue = [] - - def put(self, item): - heapq.heappush(self._queue, (item.timestamp, item)) - - def get(self): - _, item = heapq.heappop(self._queue) - return item - - def empty(self): - return len(self._queue) == 0 - - -# Implement the service. -class SchedulerServiceServicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): +class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): def __init__(self) -> None: - """Initialize the service, and setup the logger.""" - # Values used by the Servicer. - self._logger = setup_logging(name=FLAGS.log_file, log_level=FLAGS.log_level) - self._initialized = False - self._initialization_time = -1 - self._master_uri = None - - # The simulator types maintained by the Servicer. - self._worker_pool = None - self._worker_pools = None - self._drivers: Mapping[str, Task] = {} - self._workload = None - - # Scheduler information maintained by the servicer. - self._scheduler_running_lock = asyncio.Lock() - self._scheduler_running = False - self._rerun_scheduler = False - if FLAGS.scheduler == "EDF": - self._scheduler = EDFScheduler() - elif FLAGS.scheduler == "FIFO": - self._scheduler = FIFOScheduler() - else: - raise ValueError(f"Unknown scheduler {FLAGS.scheduler}.") - - # Placement information maintained by the servicer. - # The placements map the application IDs to the Placement retrieved from the - # scheduler. The placements are automatically clipped at the time of informing - # the framework of applying them to the executors. - # NOTE (Sukrit): This must always be sorted by the Placement time. 
- self._placements: Mapping[str, Sequence[Placement]] = defaultdict(list) - - # Additional task information maintained by the servicer - self._tasks_marked_for_completion = PriorityQueue() - - # Start the asyncio loop for clearing out pending tasks for completion - asyncio.create_task(self.PopTasksBasedOnTime()) - - super().__init__() - - async def schedule(self) -> None: - """Schedules the tasks that have been added to the Workload.""" - async with self._scheduler_running_lock: - if self._scheduler_running: - self._logger.error( - "Scheduler already running, this should never be reached." - ) - return - self._scheduler_running = True - - current_time = EventTime(int(time.time()), EventTime.Unit.S) - self._logger.info( - "Starting a scheduling cycle with %s TaskGraphs and %s Workers at %s.", - len(self._workload.task_graphs), - len(self._worker_pool.workers), - current_time, - ) - - # TODO (Sukrit): Change this to a better implementation. - # Let's do some simple scheduling for now, that gives a fixed number of - # executors to all the available applications in intervals of 10 seconds. - if len(self._workload.task_graphs) >= 2: - placements = self._scheduler.schedule( - sim_time=current_time, - workload=self._workload, - worker_pools=self._worker_pools, - ) - # Filter the placements that are not of type PLACE_TASK and that have not - # been placed. - filtered_placements = filter( - lambda p: p.placement_type == Placement.PlacementType.PLACE_TASK - and p.is_placed(), - placements, - ) - for placement in sorted( - filtered_placements, key=attrgetter("placement_time") - ): - self._placements[placement.task.task_graph].append(placement) - # Schedule the task here since marking it as running requires it to be - # scheduled before. We mark it to be running when we inform the - # framework of the placement. 
- placement.task.schedule( - time=placement.placement_time, - placement=placement, - ) - - self._logger.info( - "Finished the scheduling cycle initiated at %s.", current_time - ) - - # Check if another run of the Scheduler has been requested, and if so, create - # a task for it. Otherwise, mark the scheduler as not running. - async with self._scheduler_running_lock: - self._scheduler_running = False - if self._rerun_scheduler: - self._rerun_scheduler = False - asyncio.create_task(self.schedule()) - - async def run_scheduler(self) -> None: - """Checks if the scheduler is running, and if not, starts it. - - If the scheduler is already running, we queue up another execution of the - scheduler. This execution batches the scheduling requests, and runs the - scheduler only once for all the requests.""" - async with self._scheduler_running_lock: - if not self._scheduler_running: - asyncio.create_task(self.schedule()) - else: - self._rerun_scheduler = True + pass async def RegisterFramework(self, request, context): - """Registers a new framework with the backend scheduler. - This is the entry point for a new instance of Spark / Flink to register - itself with the backend scheduler, and is intended as an EHLO. - """ - if self._initialized: - self._logger.warning( - "Framework already registered at %s with the address %s", - self._initialization_time, - self._master_uri, - ) - return erdos_scheduler_pb2.RegisterFrameworkResponse( - success=False, - message=f"Framework already registered at " - f"{self._initialization_time} at the address {self._master_uri}", - ) - - # Setup a new Framework instance. - framework_name = request.name - self._master_uri = request.uri - self._initialization_time = request.timestamp - self._initialized = True - self._logger.info( - "Registering framework %s with URI %s at %s", - framework_name, - self._master_uri, - self._initialization_time, - ) - - # Setup the simulator types. 
- parsed_uri = urlparse(self._master_uri) - self._worker_pool = WorkerPool(name=f"WorkerPool_{parsed_uri.netloc}") - self._worker_pools = WorkerPools(worker_pools=[self._worker_pool]) - self._workload = Workload.from_task_graphs({}) + pass - # Return the response. - return erdos_scheduler_pb2.RegisterFrameworkResponse( - success=True, - message=f"{framework_name} at {self._master_uri} registered successfully!", - ) + async def DeregisterFramework(self, request, context): + pass async def RegisterDriver(self, request, context): - if not self._initialized: - self._logger.warning( - "Trying to register a driver with name %s and id %s, " - "but no framework is registered yet.", - request.name, - request.id, - ) - return erdos_scheduler_pb2.RegisterDriverResponse( - success=False, - message="Framework not registered yet.", - worker_id="", - ) - - # Create a Task for the Driver, and add it to the list of drivers. - # TODO (Sukrit): We drop the memory requirements for now, we should use - # them to do multi-dimensional packing using STRL. - self._logger.info( - "Received a request to register a driver with name %s, URI: %s. " - "The driver requires %s cores and %s memory.", - request.id, - request.uri, - request.cores, - request.memory, - ) - driver_resources = Resources( - resource_vector={Resource(name="Slot_CPU", _id="any"): 1} - ) - driver_job = Job( - name=request.id, - profile=WorkProfile( - name=f"WorkProfile_{request.id}", - execution_strategies=ExecutionStrategies( - [ - ExecutionStrategy( - resources=driver_resources, - batch_size=1, - # NOTE (Sukrit): Drivers are long running, and have no - # fixed runtime. Setting it to zero helps us unload the - # driver from the Worker whenever we need it. 
- runtime=EventTime.zero(), - ) - ] - ), - ), - ) - driver = Task( - name=request.id, - task_graph=request.uri, - job=driver_job, - deadline=EventTime.invalid(), - ) - self._drivers[request.id] = driver - - # Iterate over the Workers and find a Worker that can accomodate the driver. - placement_found = False - for worker in self._worker_pool.workers: - for execution_strategy in driver.available_execution_strategies: - if worker.can_accomodate_strategy(execution_strategy): - # This Worker can accomodate the Driver, we assign it here. - placement_found = True - self._worker_pool.place_task(driver, execution_strategy, worker.id) - - # Update the Task's state and placement information. - placement_time = EventTime(request.timestamp, EventTime.Unit.S) - driver.schedule( - time=placement_time, - placement=Placement( - type=Placement.PlacementType.PLACE_TASK, - computation=driver, - placement_time=placement_time, - worker_pool_id=self._worker_pool.id, - worker_id=worker.id, - strategy=execution_strategy, - ), - ) - driver.start(placement_time) - - # Tell the framework to start the driver. - return erdos_scheduler_pb2.RegisterDriverResponse( - success=True, - message=f"Driver {request.id} registered successfully!", - worker_id=worker.name, - ) - - if not placement_found: - return erdos_scheduler_pb2.RegisterDriverResponse( - success=False, - message=f"No Worker can accomodate the driver {request.id} yet.", - worker_id="", - ) + pass async def DeregisterDriver(self, request, context): - if not self._initialized: - self._logger.warning( - "Trying to deregister a driver with id %s, " - "but no framework is registered yet.", - request.id, - ) - return erdos_scheduler_pb2.DeregisterDriverResponse( - success=False, message="Framework not registered yet." 
- ) - - if request.id not in self._drivers: - self._logger.warning( - "Trying to deregister a driver with id %s, " - "but no driver with that id is registered.", - request.id, - ) - return erdos_scheduler_pb2.DeregisterDriverResponse( - success=False, - message=f"Driver with id {request.id} not registered yet.", - ) - - # Deregister the driver. - driver = self._drivers[request.id] - completion_time = EventTime(request.timestamp, EventTime.Unit.S) - self._worker_pool.remove_task(completion_time, driver) - driver.finish(completion_time) - del self._drivers[request.id] - return erdos_scheduler_pb2.DeregisterDriverResponse( - success=True, - message=f"Driver with id {request.id} deregistered successfully!", - ) + pass async def RegisterTaskGraph(self, request, context): - """Registers a new TaskGraph with the backend scheduler. - This is the entry point for a new application of Spark to register - itself with the backend scheduler, and is intended as an EHLO. - """ - if not self._initialized: - self._logger.warning( - "Trying to register a task graph with ID %s and name %s, " - "but no framework is registered yet.", - request.id, - request.name, - ) - return erdos_scheduler_pb2.RegisterTaskGraphResponse( - success=False, message="Framework not registered yet.", num_executors=0 - ) - - if request.id in self._workload.task_graphs: - self._logger.warning( - "The application with ID %s and name %s was already registered.", - request.id, - request.name, - ) - return erdos_scheduler_pb2.RegisterTaskGraphResponse( - success=False, - message=f"Application ID {request.id} with name {request.name} " - f"already registered!", - num_executors=0, - ) - - self._logger.info( - "Attempting to register application ID %s with name %s", - request.id, - request.name, - ) - # Check if query is from TPC-H workload. - # If yes, retrieve profiled slots and runtime info. 
If no, use default values - is_tpch_query = False - tpch_query_all_stage_info = None - if request.name.startswith("TPCH_"): - is_tpch_query = True - # retrieve tasks-per-stage and runtime info based on query number - tpch_query_num = request.name.split("TPCH_Q", 1)[1] - tpch_query_all_stage_info = get_all_stage_info_for_query(tpch_query_num) - same_structure, stage_id_mapping = verify_and_relable_tpch_app_graph( - query_num=tpch_query_num, dependencies=request.dependencies - ) - - # return failure message if not tpch app isnt of same DAG structure - if not same_structure: - self._logger.warning( - "TPCH application with ID %s and name %s couldn't be registered." - "DAG structure mismatch!", - request.id, - request.name, - ) - return erdos_scheduler_pb2.RegisterTaskGraphResponse( - success=False, - message=f"TPCH application ID {request.id} with name {request.name}" - f" couldn't be registered. DAG structure mismatch!", - num_executors=0, - ) - - # Construct all the Tasks for the TaskGraph. 
- task_ids_to_task: Mapping[int, Task] = {} - default_resource = Resources( - resource_vector={Resource(name="Slot_CPU", _id="any"): 20} - ) - default_runtime = EventTime(20, EventTime.Unit.US) - - for task_dependency in request.dependencies: - framework_task = task_dependency.key - if is_tpch_query: - mapped_stage_id = stage_id_mapping[framework_task.id] - task_slots = tpch_query_all_stage_info[mapped_stage_id]["num_tasks"] - task_runtime = tpch_query_all_stage_info[mapped_stage_id][ - "avg_task_duration" - ] - self._logger.info( - "Creating Task for given app TPCH stage: %s, mapped to " - "original stage id %s, with tasks: %s and avg runtime: %s", - framework_task.id, - mapped_stage_id, - task_slots, - task_runtime, - ) - task_ids_to_task[framework_task.id] = Task( - name=framework_task.name, - task_graph=request.id, - job=Job( - name=framework_task.name, - profile=WorkProfile( - name=f"WorkProfile_{framework_task.name}", - execution_strategies=ExecutionStrategies( - [ - ExecutionStrategy( - resources=( - default_resource - if not is_tpch_query - else Resources( - resource_vector={ - Resource( - name="Slot_CPU", _id="any" - ): task_slots - } - ) - ), - batch_size=1, - runtime=( - default_runtime - if not is_tpch_query - else EventTime(task_runtime, EventTime.Unit.US) - ), - ) - ] - ), - ), - ), - deadline=EventTime(request.deadline, EventTime.Unit.S), - # TODO (Sukrit): We should maintain a counter for each application - # type so that we can correlate the Tasks with a particular invocation. - timestamp=1, - ) - # NOTE (Sukrit): We maintain the StageID of the Task as a separate field - # that is not accessible / used by the Simulator. - task_ids_to_task[framework_task.id].stage_id = framework_task.id - self._logger.info( - "Constructed Task %s for the TaskGraph %s.", - framework_task.name, - request.id, - ) - - # Construct the TaskGraph from the Tasks. 
- task_graph_structure: Mapping[Task, Sequence[Task]] = {} - for task_dependency in request.dependencies: - task_graph_structure[task_ids_to_task[task_dependency.key.id]] = [ - task_ids_to_task[task_id] for task_id in task_dependency.children_ids - ] - task_graph = TaskGraph( - name=request.id, - tasks=task_graph_structure, - ) - self._workload.add_task_graph(task_graph) - self._logger.info( - "Added the TaskGraph(name=%s, id=%s) to the Workload.", - request.name, - request.id, - ) - self._logger.info( - "The structure of the TaskGraph %s is \n%s.", - request.id, - str(task_graph), - ) - - # Return the response. - return erdos_scheduler_pb2.RegisterTaskGraphResponse( - success=True, - message=f"Application ID {request.id} with name " - f"{request.name} and deadline {request.deadline} registered successfully!", - num_executors=FLAGS.initial_executors, - ) + pass async def RegisterEnvironmentReady(self, request, context): - """Registers that the environment (i.e., executors) are ready for the given - TaskGraph at the specified time. - - This is intended to release the sources of the TaskGraph to the scheduling - backend, to consider the application in this scheduling cycle. - """ - if not self._initialized: - self._logger.warning( - "Trying to register that the environment is ready for the TaskGraph " - "with ID %s, but no framework is registered yet.", - request.id, - ) - return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( - success=False, message="Framework not registered yet." 
- ) - - task_graph = self._workload.get_task_graph(request.id) - if task_graph is None: - self._logger.warning( - "Trying to register that the environment is ready for the TaskGraph " - "with ID %s, but no TaskGraph with that ID is registered.", - request.id, - ) - return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( - success=False, - message=f"TaskGraph with ID {request.id} not registered yet.", - ) - - if request.num_executors != FLAGS.initial_executors: - self._logger.warning( - "The TaskGraph %s requires %s executors, but the environment is ready " - "with %s executors.", - request.id, - FLAGS.initial_executors, - request.num_executors, - ) - return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( - success=False, - message=f"Number of executors not {FLAGS.initial_executors}.", - ) - - # Release all the sources of the TaskGraph at the given time. - for source_task in task_graph.get_source_tasks(): - source_task.release(EventTime(request.timestamp, EventTime.Unit.S)) - - # Run the scheduler since the Workload has changed. - await self.run_scheduler() - - return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( - success=True, - message=f"Environment ready for TaskGraph with ID {request.id}!", - ) - - async def DeregisterFramework(self, request, context): - """Deregisters the framework with the backend scheduler. - This is the exit point for a running instance of Spark / Flink to deregister""" - if not self._initialized: - self._logger.warning( - "Trying to deregister the framework at %s, " - "but no framework is registered yet.", - request.uri, - ) - return erdos_scheduler_pb2.DeregisterFrameworkResponse( - success=False, message="Framework not registered yet." 
- ) - - if not self._master_uri == request.uri: - self._logger.warning( - "Trying to deregister the framework at %s, " - "but the registered framework is at %s.", - request.uri, - self._master_uri, - ) - return erdos_scheduler_pb2.DeregisterFrameworkResponse( - success=False, - message=f"Framework not registered at {request.uri} yet.", - ) - - # Deregister the framework. - self._initialization_time = None - self._master_uri = None - self._initialized = False - self._logger.info("Deregistering framework at %s", request.uri) - return erdos_scheduler_pb2.DeregisterFrameworkResponse( - success=True, - message=f"Framework at {request.uri} deregistered successfully!", - ) + pass async def RegisterWorker(self, request, context): - """Registers a new worker with the backend scheduler.""" - if not self._initialized: - self._logger.warning( - "Trying to register a worker with name %s and id %s, " - "but no framework is registered yet.", - request.name, - request.id, - ) - return erdos_scheduler_pb2.RegisterWorkerResponse( - success=False, message="Framework not registered yet." - ) - - # First, we construct the Resources with the given size. - # TODO (Sukrit): Right now, we drop the memory requirements, we should use - # them to do multi-dimensional packing using STRL. - cpu_resource = Resource(name="Slot_CPU") - worker_resources = Resources(resource_vector={cpu_resource: request.cores}) - self._logger.debug( - "Successfully constructed the resources for the worker %s: %s.", - request.name, - worker_resources, - ) - - # Construct a new Worker instance, and add it to the WorkerPool. - worker = Worker( - name=request.id, - resources=worker_resources, - ) - self._worker_pool.add_workers([worker]) - - self._logger.info( - "Registering worker with name %s, and resources %s.", - worker.name, - worker_resources, - ) - - # Run the scheduler since the Resource set has changed, and new task graphs - # may become eligible to run. 
- await self.run_scheduler() - - return erdos_scheduler_pb2.RegisterWorkerResponse( - success=True, - message=f"Worker {request.name} registered successfully!", - cores=FLAGS.virtualized_cores, - memory=FLAGS.virtualized_memory * 1024, - ) - - async def NotifyTaskCompletion(self, request, context): - """Notifies the backend scheduler that a task has completed.""" - if not self._initialized: - self._logger.warning( - "Trying to notify the backend scheduler that the task with ID %s " - "from application %s has completed, " - "but no framework is registered yet.", - request.task_id, - request.application_id, - ) - return erdos_scheduler_pb2.NotifyTaskCompletionResponse( - success=False, message="Framework not registered yet." - ) - - task_graph = self._workload.get_task_graph(request.application_id) - if task_graph is None: - self._logger.warning( - "Trying to notify the backend scheduler that the task with ID %s " - "from application %s has completed, but the application " - "was not registered with the backend yet.", - request.task_id, - request.application_id, - ) - return erdos_scheduler_pb2.NotifyTaskCompletionResponse( - success=False, - message=f"Application with ID {request.application_id} " - f"not registered yet.", - ) - - # Find the Task that has completed, and mark it as such. 
- matched_task = None - for task in task_graph.get_nodes(): - if task.stage_id == request.task_id: - matched_task = task - if matched_task is None: - self._logger.warning( - "Trying to notify the backend scheduler that the task with ID %s " - "from application %s has completed, but the task " - "was not found in the TaskGraph.", - request.task_id, - request.application_id, - ) - return erdos_scheduler_pb2.NotifyTaskCompletionResponse( - success=False, - message=f"Task with ID {request.task_id} " - f"not found in TaskGraph {request.application_id}.", - ) - - # Instead of completing & removing the task immediately, check - # if it is actually complete or will complete in the future - - # Get the actual task completion timestamp - actual_task_completion_time = ( - matched_task.start_time.time + matched_task.remaining_time.time - ) - - current_time = time.time() - self._logger.info( - "Received task for completion at time: %s , task.start_time: %s ," - "task.remaining_time (=runtime): %s , actual completion time: %s ", - round(current_time), - matched_task.start_time.time, - matched_task.remaining_time.time, - actual_task_completion_time, - ) - - # TODO DG: remaining_time assumes execution of the slowest strategy - # Should be updated to reflect correct remaining_time based on chosen strategy? - - # Add all tasks to _tasks_marked_for_completion queue. - # If task has actually completed, it will be dequeued immediately - # Else it will be dequeued at its actual task completion time - self._tasks_marked_for_completion.put( - TimedItem(actual_task_completion_time, matched_task) - ) - - # NOTE: task.finish() and run_scheduler() invocations are postponed - # until it is time for the task to be actually marked as complete. - - return erdos_scheduler_pb2.NotifyTaskCompletionResponse( - success=True, - message=f"Task with ID {request.task_id} marked for completion at " - f"{round(current_time)}! 
It will be removed on actual " - f"task completion time at {actual_task_completion_time}", - ) + pass async def GetPlacements(self, request, context): - """Retrieves the placements applicable at the specified time.""" - request_timestamp = EventTime(request.timestamp, EventTime.Unit.S) - if not self._initialized: - self._logger.warning( - "Trying to get placements for %s at time %s, " - "but no framework is registered yet.", - request.id, - request_timestamp, - ) - return erdos_scheduler_pb2.GetPlacementsResponse( - success=False, message="Framework not registered yet." - ) - - if request.id not in self._placements: - self._logger.warning( - "Trying to get placements for %s at time %s, but the application " - "was not registered with the backend yet.", - request.id, - request_timestamp, - ) - - # Construct and return the placements., - placements = [] - clip_at = -1 - for index, placement in enumerate(self._placements[request.id]): - if placement.placement_time <= request_timestamp: - clip_at = index - # Mark the Task as RUNNING. 
- placement.task.start(request_timestamp) - - # resources = placement.execution_strategy.resources - placements.append( - erdos_scheduler_pb2.Placement( - worker_id=placement.worker_id, - application_id=request.id, - task_id=placement.task.stage_id, - cores=1, - ) - ) - self._placements[request.id] = self._placements[request.id][clip_at + 1 :] - self._logger.info( - "Constructed %s placements at time %s for application with ID %s.", - len(placements), - request.timestamp, - request.id, - ) - return erdos_scheduler_pb2.GetPlacementsResponse( - success=True, - placements=placements, - message=f"Constructed {len(placements)} " - f"placements at time {request.timestamp}.", - ) + pass - # Function to pop tasks from queue based on actual completion time - async def PopTasksBasedOnTime(self): - while True: - if not self._tasks_marked_for_completion.empty(): - # Get the top item from the priority queue - top_item = self._tasks_marked_for_completion._queue[0][1] - - # Check if top item's timestamp is reached or passed by current time - current_time = time.time() - if top_item.timestamp <= current_time: - # Pop the top item - popped_item = self._tasks_marked_for_completion.get() - self._logger.info( - "Removing tasks from pending completion queue: %s at time: %s", - popped_item.task, - current_time, - ) + async def NotifyTaskCompletion(self, request, context): + pass - # Mark the Task as completed. - # Also release the task from the scheduler service - popped_item.task.update_remaining_time(EventTime.zero()) - popped_item.task.finish( - EventTime(round(current_time), EventTime.Unit.S) - ) - # Run the scheduler since the Workload has changed. 
- await self.run_scheduler() +async def serve(server): + await server.start() + print("Initialized ERDOS RPC Service on port", FLAGS.port) + await server.wait_for_termination() - else: - # If the top item's timestamp hasn't been reached yet, - # sleep for a short duration - await asyncio.sleep(0.1) # TODO: Can adjust value, curr=0.1s - else: - # If the queue is empty, sleep for a short duration - await asyncio.sleep(0.1) # TODO: Can adjust value, curr=0.1s +def main(_argv): + loop = asyncio.get_event_loop() -async def serve(): - """Serves the ERDOS Scheduling RPC Server.""" - # Initialize the server. server = grpc.aio.server(futures.ThreadPoolExecutor(max_workers=FLAGS.max_workers)) erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server( - SchedulerServiceServicer(), server + Servicer(), server ) - - # Start the server. server.add_insecure_port(f"[::]:{FLAGS.port}") - await server.start() - print("Initialized ERDOS Scheduling RPC Server on port", FLAGS.port) - await server.wait_for_termination() - - -def main(argv): - # Create an asyncio event loop - loop = asyncio.get_event_loop() - # Run the event loop until serve() completes try: - loop.run_until_complete(serve()) + loop.run_until_complete(serve(server)) + except KeyboardInterrupt: + print("Terminated ERDOS RPC Service") finally: loop.close() diff --git a/rpc/service_old.py b/rpc/service_old.py new file mode 100644 index 00000000..2aaa2dc9 --- /dev/null +++ b/rpc/service_old.py @@ -0,0 +1,874 @@ +import asyncio +import heapq +import os +import sys +import time +from collections import defaultdict +from concurrent import futures +from operator import attrgetter +from typing import Mapping, Sequence +from urllib.parse import urlparse + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) + +import erdos_scheduler_pb2 +import erdos_scheduler_pb2_grpc +import grpc +from absl import app, flags +from tpch_utils import get_all_stage_info_for_query, 
verify_and_relable_tpch_app_graph + +from schedulers import EDFScheduler, FIFOScheduler +from utils import EventTime, setup_logging +from workers import Worker, WorkerPool, WorkerPools +from workload import ( + ExecutionStrategies, + ExecutionStrategy, + Job, + Placement, + Resource, + Resources, + Task, + TaskGraph, + Workload, + WorkProfile, +) + +FLAGS = flags.FLAGS + +flags.DEFINE_integer("port", 50051, "Port to serve the ERDOS Scheduling RPC Server on.") +flags.DEFINE_integer( + "max_workers", 10, "Maximum number of workers to use for the RPC server." +) +flags.DEFINE_string("log_file", None, "Path to the log file.", short_name="log") +flags.DEFINE_string("log_level", "debug", "The level to log.") +flags.DEFINE_integer( + "initial_executors", + 10, + "The initial number of executors that are requested by each application.", +) +flags.DEFINE_integer( + "virtualized_cores", + 500, + "The number of virtualized cores that must be created in each Worker on the " + "framework. This allows us to spawn a higher number of executors than the number " + "possible with actual available resources. Thus, we can spawn the executors for " + "each application, and only selectively activate them according to the actual " + "available resources.", +) +flags.DEFINE_integer( + "virtualized_memory", + 500, + "The amount of virtualized memory (in GB) that must be created in each Worker on " + "the framework. Refer to the `virtualized_cores` flag for more information.", +) +flags.DEFINE_enum( + "scheduler", "EDF", ["FIFO", "EDF"], "The scheduler to use for this execution." 
+) + + +# Define an item containing completion timestamp and task +class TimedItem: + def __init__(self, timestamp, task): + self.timestamp = timestamp + self.task = task + + +# Define a priority queue based on heapq module +class PriorityQueue: + def __init__(self): + self._queue = [] + + def put(self, item): + heapq.heappush(self._queue, (item.timestamp, item)) + + def get(self): + _, item = heapq.heappop(self._queue) + return item + + def empty(self): + return len(self._queue) == 0 + + +# Implement the service. +class SchedulerServiceServicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): + def __init__(self) -> None: + """Initialize the service, and setup the logger.""" + # Values used by the Servicer. + self._logger = setup_logging(name=FLAGS.log_file, log_level=FLAGS.log_level) + self._initialized = False + self._initialization_time = -1 + self._master_uri = None + + # The simulator types maintained by the Servicer. + self._worker_pool = None + self._worker_pools = None + self._drivers: Mapping[str, Task] = {} + self._workload = None + + # Scheduler information maintained by the servicer. + self._scheduler_running_lock = asyncio.Lock() + self._scheduler_running = False + self._rerun_scheduler = False + if FLAGS.scheduler == "EDF": + self._scheduler = EDFScheduler() + elif FLAGS.scheduler == "FIFO": + self._scheduler = FIFOScheduler() + else: + raise ValueError(f"Unknown scheduler {FLAGS.scheduler}.") + + # Placement information maintained by the servicer. + # The placements map the application IDs to the Placement retrieved from the + # scheduler. The placements are automatically clipped at the time of informing + # the framework of applying them to the executors. + # NOTE (Sukrit): This must always be sorted by the Placement time. 
+ self._placements: Mapping[str, Sequence[Placement]] = defaultdict(list) + + # Additional task information maintained by the servicer + self._tasks_marked_for_completion = PriorityQueue() + + # Start the asyncio loop for clearing out pending tasks for completion + asyncio.create_task(self.PopTasksBasedOnTime()) + + super().__init__() + + async def schedule(self) -> None: + """Schedules the tasks that have been added to the Workload.""" + async with self._scheduler_running_lock: + if self._scheduler_running: + self._logger.error( + "Scheduler already running, this should never be reached." + ) + return + self._scheduler_running = True + + current_time = EventTime(int(time.time()), EventTime.Unit.S) + self._logger.info( + "Starting a scheduling cycle with %s TaskGraphs and %s Workers at %s.", + len(self._workload.task_graphs), + len(self._worker_pool.workers), + current_time, + ) + + # TODO (Sukrit): Change this to a better implementation. + # Let's do some simple scheduling for now, that gives a fixed number of + # executors to all the available applications in intervals of 10 seconds. + if len(self._workload.task_graphs) >= 2: + placements = self._scheduler.schedule( + sim_time=current_time, + workload=self._workload, + worker_pools=self._worker_pools, + ) + # Filter the placements that are not of type PLACE_TASK and that have not + # been placed. + filtered_placements = filter( + lambda p: p.placement_type == Placement.PlacementType.PLACE_TASK + and p.is_placed(), + placements, + ) + for placement in sorted( + filtered_placements, key=attrgetter("placement_time") + ): + self._placements[placement.task.task_graph].append(placement) + # Schedule the task here since marking it as running requires it to be + # scheduled before. We mark it to be running when we inform the + # framework of the placement. 
+ placement.task.schedule( + time=placement.placement_time, + placement=placement, + ) + + self._logger.info( + "Finished the scheduling cycle initiated at %s.", current_time + ) + + # Check if another run of the Scheduler has been requested, and if so, create + # a task for it. Otherwise, mark the scheduler as not running. + async with self._scheduler_running_lock: + self._scheduler_running = False + if self._rerun_scheduler: + self._rerun_scheduler = False + asyncio.create_task(self.schedule()) + + async def run_scheduler(self) -> None: + """Checks if the scheduler is running, and if not, starts it. + + If the scheduler is already running, we queue up another execution of the + scheduler. This execution batches the scheduling requests, and runs the + scheduler only once for all the requests.""" + async with self._scheduler_running_lock: + if not self._scheduler_running: + asyncio.create_task(self.schedule()) + else: + self._rerun_scheduler = True + + async def RegisterFramework(self, request, context): + """Registers a new framework with the backend scheduler. + This is the entry point for a new instance of Spark / Flink to register + itself with the backend scheduler, and is intended as an EHLO. + """ + if self._initialized: + self._logger.warning( + "Framework already registered at %s with the address %s", + self._initialization_time, + self._master_uri, + ) + return erdos_scheduler_pb2.RegisterFrameworkResponse( + success=False, + message=f"Framework already registered at " + f"{self._initialization_time} at the address {self._master_uri}", + ) + + # Setup a new Framework instance. + framework_name = request.name + self._master_uri = request.uri + self._initialization_time = request.timestamp + self._initialized = True + self._logger.info( + "Registering framework %s with URI %s at %s", + framework_name, + self._master_uri, + self._initialization_time, + ) + + # Setup the simulator types. 
+ parsed_uri = urlparse(self._master_uri) + self._worker_pool = WorkerPool(name=f"WorkerPool_{parsed_uri.netloc}") + self._worker_pools = WorkerPools(worker_pools=[self._worker_pool]) + self._workload = Workload.from_task_graphs({}) + + # Return the response. + return erdos_scheduler_pb2.RegisterFrameworkResponse( + success=True, + message=f"{framework_name} at {self._master_uri} registered successfully!", + ) + + async def RegisterDriver(self, request, context): + if not self._initialized: + self._logger.warning( + "Trying to register a driver with name %s and id %s, " + "but no framework is registered yet.", + request.name, + request.id, + ) + return erdos_scheduler_pb2.RegisterDriverResponse( + success=False, + message="Framework not registered yet.", + worker_id="", + ) + + # Create a Task for the Driver, and add it to the list of drivers. + # TODO (Sukrit): We drop the memory requirements for now, we should use + # them to do multi-dimensional packing using STRL. + self._logger.info( + "Received a request to register a driver with name %s, URI: %s. " + "The driver requires %s cores and %s memory.", + request.id, + request.uri, + request.cores, + request.memory, + ) + driver_resources = Resources( + resource_vector={Resource(name="Slot_CPU", _id="any"): 1} + ) + driver_job = Job( + name=request.id, + profile=WorkProfile( + name=f"WorkProfile_{request.id}", + execution_strategies=ExecutionStrategies( + [ + ExecutionStrategy( + resources=driver_resources, + batch_size=1, + # NOTE (Sukrit): Drivers are long running, and have no + # fixed runtime. Setting it to zero helps us unload the + # driver from the Worker whenever we need it. + runtime=EventTime.zero(), + ) + ] + ), + ), + ) + driver = Task( + name=request.id, + task_graph=request.uri, + job=driver_job, + deadline=EventTime.invalid(), + ) + self._drivers[request.id] = driver + + # Iterate over the Workers and find a Worker that can accomodate the driver. 
+ placement_found = False + for worker in self._worker_pool.workers: + for execution_strategy in driver.available_execution_strategies: + if worker.can_accomodate_strategy(execution_strategy): + # This Worker can accomodate the Driver, we assign it here. + placement_found = True + self._worker_pool.place_task(driver, execution_strategy, worker.id) + + # Update the Task's state and placement information. + placement_time = EventTime(request.timestamp, EventTime.Unit.S) + driver.schedule( + time=placement_time, + placement=Placement( + type=Placement.PlacementType.PLACE_TASK, + computation=driver, + placement_time=placement_time, + worker_pool_id=self._worker_pool.id, + worker_id=worker.id, + strategy=execution_strategy, + ), + ) + driver.start(placement_time) + + # Tell the framework to start the driver. + return erdos_scheduler_pb2.RegisterDriverResponse( + success=True, + message=f"Driver {request.id} registered successfully!", + worker_id=worker.name, + ) + + if not placement_found: + return erdos_scheduler_pb2.RegisterDriverResponse( + success=False, + message=f"No Worker can accomodate the driver {request.id} yet.", + worker_id="", + ) + + async def DeregisterDriver(self, request, context): + if not self._initialized: + self._logger.warning( + "Trying to deregister a driver with id %s, " + "but no framework is registered yet.", + request.id, + ) + return erdos_scheduler_pb2.DeregisterDriverResponse( + success=False, message="Framework not registered yet." + ) + + if request.id not in self._drivers: + self._logger.warning( + "Trying to deregister a driver with id %s, " + "but no driver with that id is registered.", + request.id, + ) + return erdos_scheduler_pb2.DeregisterDriverResponse( + success=False, + message=f"Driver with id {request.id} not registered yet.", + ) + + # Deregister the driver. 
+ driver = self._drivers[request.id] + completion_time = EventTime(request.timestamp, EventTime.Unit.S) + self._worker_pool.remove_task(completion_time, driver) + driver.finish(completion_time) + del self._drivers[request.id] + return erdos_scheduler_pb2.DeregisterDriverResponse( + success=True, + message=f"Driver with id {request.id} deregistered successfully!", + ) + + async def RegisterTaskGraph(self, request, context): + """Registers a new TaskGraph with the backend scheduler. + This is the entry point for a new application of Spark to register + itself with the backend scheduler, and is intended as an EHLO. + """ + if not self._initialized: + self._logger.warning( + "Trying to register a task graph with ID %s and name %s, " + "but no framework is registered yet.", + request.id, + request.name, + ) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message="Framework not registered yet.", num_executors=0 + ) + + if request.id in self._workload.task_graphs: + self._logger.warning( + "The application with ID %s and name %s was already registered.", + request.id, + request.name, + ) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, + message=f"Application ID {request.id} with name {request.name} " + f"already registered!", + num_executors=0, + ) + + self._logger.info( + "Attempting to register application ID %s with name %s", + request.id, + request.name, + ) + # Check if query is from TPC-H workload. + # If yes, retrieve profiled slots and runtime info. 
If no, use default values
+        is_tpch_query = False
+        tpch_query_all_stage_info = None
+        if request.name.startswith("TPCH_"):
+            is_tpch_query = True
+            # retrieve tasks-per-stage and runtime info based on query number
+            tpch_query_num = request.name.split("TPCH_Q", 1)[1]
+            tpch_query_all_stage_info = get_all_stage_info_for_query(tpch_query_num)
+            same_structure, stage_id_mapping = verify_and_relable_tpch_app_graph(
+                query_num=tpch_query_num, dependencies=request.dependencies
+            )
+
+            # Return a failure response if the TPC-H app's DAG structure mismatches
+            if not same_structure:
+                self._logger.warning(
+                    "TPCH application with ID %s and name %s couldn't be registered."
+                    "DAG structure mismatch!",
+                    request.id,
+                    request.name,
+                )
+                return erdos_scheduler_pb2.RegisterTaskGraphResponse(
+                    success=False,
+                    message=f"TPCH application ID {request.id} with name {request.name}"
+                    f" couldn't be registered. DAG structure mismatch!",
+                    num_executors=0,
+                )
+
+        # Construct all the Tasks for the TaskGraph.
+ task_ids_to_task: Mapping[int, Task] = {} + default_resource = Resources( + resource_vector={Resource(name="Slot_CPU", _id="any"): 20} + ) + default_runtime = EventTime(20, EventTime.Unit.US) + + for task_dependency in request.dependencies: + framework_task = task_dependency.key + if is_tpch_query: + mapped_stage_id = stage_id_mapping[framework_task.id] + task_slots = tpch_query_all_stage_info[mapped_stage_id]["num_tasks"] + task_runtime = tpch_query_all_stage_info[mapped_stage_id][ + "avg_task_duration" + ] + self._logger.info( + "Creating Task for given app TPCH stage: %s, mapped to " + "original stage id %s, with tasks: %s and avg runtime: %s", + framework_task.id, + mapped_stage_id, + task_slots, + task_runtime, + ) + task_ids_to_task[framework_task.id] = Task( + name=framework_task.name, + task_graph=request.id, + job=Job( + name=framework_task.name, + profile=WorkProfile( + name=f"WorkProfile_{framework_task.name}", + execution_strategies=ExecutionStrategies( + [ + ExecutionStrategy( + resources=( + default_resource + if not is_tpch_query + else Resources( + resource_vector={ + Resource( + name="Slot_CPU", _id="any" + ): task_slots + } + ) + ), + batch_size=1, + runtime=( + default_runtime + if not is_tpch_query + else EventTime(task_runtime, EventTime.Unit.US) + ), + ) + ] + ), + ), + ), + deadline=EventTime(request.deadline, EventTime.Unit.S), + # TODO (Sukrit): We should maintain a counter for each application + # type so that we can correlate the Tasks with a particular invocation. + timestamp=1, + ) + # NOTE (Sukrit): We maintain the StageID of the Task as a separate field + # that is not accessible / used by the Simulator. + task_ids_to_task[framework_task.id].stage_id = framework_task.id + self._logger.info( + "Constructed Task %s for the TaskGraph %s.", + framework_task.name, + request.id, + ) + + # Construct the TaskGraph from the Tasks. 
+ task_graph_structure: Mapping[Task, Sequence[Task]] = {} + for task_dependency in request.dependencies: + task_graph_structure[task_ids_to_task[task_dependency.key.id]] = [ + task_ids_to_task[task_id] for task_id in task_dependency.children_ids + ] + task_graph = TaskGraph( + name=request.id, + tasks=task_graph_structure, + ) + self._workload.add_task_graph(task_graph) + self._logger.info( + "Added the TaskGraph(name=%s, id=%s) to the Workload.", + request.name, + request.id, + ) + self._logger.info( + "The structure of the TaskGraph %s is \n%s.", + request.id, + str(task_graph), + ) + + # Return the response. + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=True, + message=f"Application ID {request.id} with name " + f"{request.name} and deadline {request.deadline} registered successfully!", + num_executors=FLAGS.initial_executors, + ) + + async def RegisterEnvironmentReady(self, request, context): + """Registers that the environment (i.e., executors) are ready for the given + TaskGraph at the specified time. + + This is intended to release the sources of the TaskGraph to the scheduling + backend, to consider the application in this scheduling cycle. + """ + if not self._initialized: + self._logger.warning( + "Trying to register that the environment is ready for the TaskGraph " + "with ID %s, but no framework is registered yet.", + request.id, + ) + return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( + success=False, message="Framework not registered yet." 
+ ) + + task_graph = self._workload.get_task_graph(request.id) + if task_graph is None: + self._logger.warning( + "Trying to register that the environment is ready for the TaskGraph " + "with ID %s, but no TaskGraph with that ID is registered.", + request.id, + ) + return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( + success=False, + message=f"TaskGraph with ID {request.id} not registered yet.", + ) + + if request.num_executors != FLAGS.initial_executors: + self._logger.warning( + "The TaskGraph %s requires %s executors, but the environment is ready " + "with %s executors.", + request.id, + FLAGS.initial_executors, + request.num_executors, + ) + return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( + success=False, + message=f"Number of executors not {FLAGS.initial_executors}.", + ) + + # Release all the sources of the TaskGraph at the given time. + for source_task in task_graph.get_source_tasks(): + source_task.release(EventTime(request.timestamp, EventTime.Unit.S)) + + # Run the scheduler since the Workload has changed. + await self.run_scheduler() + + return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( + success=True, + message=f"Environment ready for TaskGraph with ID {request.id}!", + ) + + async def DeregisterFramework(self, request, context): + """Deregisters the framework with the backend scheduler. + This is the exit point for a running instance of Spark / Flink to deregister""" + if not self._initialized: + self._logger.warning( + "Trying to deregister the framework at %s, " + "but no framework is registered yet.", + request.uri, + ) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=False, message="Framework not registered yet." 
+ ) + + if not self._master_uri == request.uri: + self._logger.warning( + "Trying to deregister the framework at %s, " + "but the registered framework is at %s.", + request.uri, + self._master_uri, + ) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=False, + message=f"Framework not registered at {request.uri} yet.", + ) + + # Deregister the framework. + self._initialization_time = None + self._master_uri = None + self._initialized = False + self._logger.info("Deregistering framework at %s", request.uri) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=True, + message=f"Framework at {request.uri} deregistered successfully!", + ) + + async def RegisterWorker(self, request, context): + """Registers a new worker with the backend scheduler.""" + if not self._initialized: + self._logger.warning( + "Trying to register a worker with name %s and id %s, " + "but no framework is registered yet.", + request.name, + request.id, + ) + return erdos_scheduler_pb2.RegisterWorkerResponse( + success=False, message="Framework not registered yet." + ) + + # First, we construct the Resources with the given size. + # TODO (Sukrit): Right now, we drop the memory requirements, we should use + # them to do multi-dimensional packing using STRL. + cpu_resource = Resource(name="Slot_CPU") + worker_resources = Resources(resource_vector={cpu_resource: request.cores}) + self._logger.debug( + "Successfully constructed the resources for the worker %s: %s.", + request.name, + worker_resources, + ) + + # Construct a new Worker instance, and add it to the WorkerPool. + worker = Worker( + name=request.id, + resources=worker_resources, + ) + self._worker_pool.add_workers([worker]) + + self._logger.info( + "Registering worker with name %s, and resources %s.", + worker.name, + worker_resources, + ) + + # Run the scheduler since the Resource set has changed, and new task graphs + # may become eligible to run. 
+ await self.run_scheduler() + + return erdos_scheduler_pb2.RegisterWorkerResponse( + success=True, + message=f"Worker {request.name} registered successfully!", + cores=FLAGS.virtualized_cores, + memory=FLAGS.virtualized_memory * 1024, + ) + + async def NotifyTaskCompletion(self, request, context): + """Notifies the backend scheduler that a task has completed.""" + if not self._initialized: + self._logger.warning( + "Trying to notify the backend scheduler that the task with ID %s " + "from application %s has completed, " + "but no framework is registered yet.", + request.task_id, + request.application_id, + ) + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=False, message="Framework not registered yet." + ) + + task_graph = self._workload.get_task_graph(request.application_id) + if task_graph is None: + self._logger.warning( + "Trying to notify the backend scheduler that the task with ID %s " + "from application %s has completed, but the application " + "was not registered with the backend yet.", + request.task_id, + request.application_id, + ) + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=False, + message=f"Application with ID {request.application_id} " + f"not registered yet.", + ) + + # Find the Task that has completed, and mark it as such. 
+ matched_task = None + for task in task_graph.get_nodes(): + if task.stage_id == request.task_id: + matched_task = task + if matched_task is None: + self._logger.warning( + "Trying to notify the backend scheduler that the task with ID %s " + "from application %s has completed, but the task " + "was not found in the TaskGraph.", + request.task_id, + request.application_id, + ) + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=False, + message=f"Task with ID {request.task_id} " + f"not found in TaskGraph {request.application_id}.", + ) + + # Instead of completing & removing the task immediately, check + # if it is actually complete or will complete in the future + + # Get the actual task completion timestamp + actual_task_completion_time = ( + matched_task.start_time.time + matched_task.remaining_time.time + ) + + current_time = time.time() + self._logger.info( + "Received task for completion at time: %s , task.start_time: %s ," + "task.remaining_time (=runtime): %s , actual completion time: %s ", + round(current_time), + matched_task.start_time.time, + matched_task.remaining_time.time, + actual_task_completion_time, + ) + + # TODO DG: remaining_time assumes execution of the slowest strategy + # Should be updated to reflect correct remaining_time based on chosen strategy? + + # Add all tasks to _tasks_marked_for_completion queue. + # If task has actually completed, it will be dequeued immediately + # Else it will be dequeued at its actual task completion time + self._tasks_marked_for_completion.put( + TimedItem(actual_task_completion_time, matched_task) + ) + + # NOTE: task.finish() and run_scheduler() invocations are postponed + # until it is time for the task to be actually marked as complete. + + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=True, + message=f"Task with ID {request.task_id} marked for completion at " + f"{round(current_time)}! 
It will be removed on actual " + f"task completion time at {actual_task_completion_time}", + ) + + async def GetPlacements(self, request, context): + """Retrieves the placements applicable at the specified time.""" + request_timestamp = EventTime(request.timestamp, EventTime.Unit.S) + if not self._initialized: + self._logger.warning( + "Trying to get placements for %s at time %s, " + "but no framework is registered yet.", + request.id, + request_timestamp, + ) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=False, message="Framework not registered yet." + ) + + if request.id not in self._placements: + self._logger.warning( + "Trying to get placements for %s at time %s, but the application " + "was not registered with the backend yet.", + request.id, + request_timestamp, + ) + + # Construct and return the placements., + placements = [] + clip_at = -1 + for index, placement in enumerate(self._placements[request.id]): + if placement.placement_time <= request_timestamp: + clip_at = index + # Mark the Task as RUNNING. 
+ placement.task.start(request_timestamp) + + # resources = placement.execution_strategy.resources + placements.append( + erdos_scheduler_pb2.Placement( + worker_id=placement.worker_id, + application_id=request.id, + task_id=placement.task.stage_id, + cores=1, + ) + ) + self._placements[request.id] = self._placements[request.id][clip_at + 1 :] + self._logger.info( + "Constructed %s placements at time %s for application with ID %s.", + len(placements), + request.timestamp, + request.id, + ) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=True, + placements=placements, + message=f"Constructed {len(placements)} " + f"placements at time {request.timestamp}.", + ) + + # Function to pop tasks from queue based on actual completion time + async def PopTasksBasedOnTime(self): + while True: + if not self._tasks_marked_for_completion.empty(): + # Get the top item from the priority queue + top_item = self._tasks_marked_for_completion._queue[0][1] + + # Check if top item's timestamp is reached or passed by current time + current_time = time.time() + if top_item.timestamp <= current_time: + # Pop the top item + popped_item = self._tasks_marked_for_completion.get() + self._logger.info( + "Removing tasks from pending completion queue: %s at time: %s", + popped_item.task, + current_time, + ) + + # Mark the Task as completed. + # Also release the task from the scheduler service + popped_item.task.update_remaining_time(EventTime.zero()) + popped_item.task.finish( + EventTime(round(current_time), EventTime.Unit.S) + ) + + # Run the scheduler since the Workload has changed. 
+ await self.run_scheduler() + + else: + # If the top item's timestamp hasn't been reached yet, + # sleep for a short duration + await asyncio.sleep(0.1) # TODO: Can adjust value, curr=0.1s + else: + # If the queue is empty, sleep for a short duration + await asyncio.sleep(0.1) # TODO: Can adjust value, curr=0.1s + + +async def serve(): + """Serves the ERDOS Scheduling RPC Server.""" + # Initialize the server. + server = grpc.aio.server(futures.ThreadPoolExecutor(max_workers=FLAGS.max_workers)) + erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server( + SchedulerServiceServicer(), server + ) + + # Start the server. + server.add_insecure_port(f"[::]:{FLAGS.port}") + await server.start() + print("Initialized ERDOS Scheduling RPC Server on port", FLAGS.port) + await server.wait_for_termination() + + +def main(argv): + # Create an asyncio event loop + loop = asyncio.get_event_loop() + + # Run the event loop until serve() completes + try: + loop.run_until_complete(serve()) + finally: + loop.close() + + +if __name__ == "__main__": + app.run(main) From eba9a4599e23e7745e7c9a0cb38aded75786be23 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 13:51:14 -0500 Subject: [PATCH 016/128] update rpc proto dir hierarchy to resolve module import issue with sys.path hack --- rpc/README.md | 10 +++++----- rpc/protos/{ => rpc}/erdos_scheduler.proto | 0 rpc/service.py | 5 ++--- 3 files changed, 7 insertions(+), 8 deletions(-) rename rpc/protos/{ => rpc}/erdos_scheduler.proto (100%) diff --git a/rpc/README.md b/rpc/README.md index 294e2287..dcb13dbe 100644 --- a/rpc/README.md +++ b/rpc/README.md @@ -5,26 +5,26 @@ The package provides support for connecting frameworks to the ERDOS Simulator, w This code is being tested with Apache Spark v3.5.0 (with additional instrumentation outlined in [this](https://github.com/dhruvsgarg/spark_mirror/tree/erdos-spark-integration) repository) -To get the RPC service setup, first install the required packages using: +To 
get the RPC service setup, from the ERDOS root directory, install the required packages using: ```bash -pip install -r requirements.txt +pip install -r rpc/requirements.txt ``` Then, run protoc to generate the service and message definitions using: ```bash -python -m grpc_tools.protoc -I./protos --python_out=./ --grpc_python_out=./ ./protos/erdos_scheduler.proto +python -m grpc_tools.protoc -I./rpc/protos --python_out=. --grpc_python_out=. ./rpc/protos/rpc/erdos_scheduler.proto ``` and run the service using: ```bash -python service.py +python -m rpc.service ``` You can also find the supported flags by the service, by running ```bash -python service.py --help +python -m rpc.service --help ``` diff --git a/rpc/protos/erdos_scheduler.proto b/rpc/protos/rpc/erdos_scheduler.proto similarity index 100% rename from rpc/protos/erdos_scheduler.proto rename to rpc/protos/rpc/erdos_scheduler.proto diff --git a/rpc/service.py b/rpc/service.py index 2f048067..727e2afb 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -1,9 +1,8 @@ import asyncio from concurrent import futures - -import erdos_scheduler_pb2 -import erdos_scheduler_pb2_grpc +from rpc import erdos_scheduler_pb2 +from rpc import erdos_scheduler_pb2_grpc import grpc From e6a364a439c8bb53fbd7d4809aef76f71d860e31 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 14:43:33 -0500 Subject: [PATCH 017/128] checkout dhruv's version of service.py --- rpc/service_old.py | 1257 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 1115 insertions(+), 142 deletions(-) diff --git a/rpc/service_old.py b/rpc/service_old.py index 2aaa2dc9..6629ebc7 100644 --- a/rpc/service_old.py +++ b/rpc/service_old.py @@ -1,12 +1,13 @@ import asyncio import heapq +import math import os +import random import sys import time -from collections import defaultdict from concurrent import futures from operator import attrgetter -from typing import Mapping, Sequence +from typing import Dict, Mapping, Sequence from 
urllib.parse import urlparse sys.path.append( @@ -19,7 +20,7 @@ from absl import app, flags from tpch_utils import get_all_stage_info_for_query, verify_and_relable_tpch_app_graph -from schedulers import EDFScheduler, FIFOScheduler +from schedulers import EDFScheduler, FIFOScheduler, TetriSchedScheduler from utils import EventTime, setup_logging from workers import Worker, WorkerPool, WorkerPools from workload import ( @@ -31,6 +32,7 @@ Resources, Task, TaskGraph, + TaskState, Workload, WorkProfile, ) @@ -41,13 +43,21 @@ flags.DEFINE_integer( "max_workers", 10, "Maximum number of workers to use for the RPC server." ) -flags.DEFINE_string("log_file", None, "Path to the log file.", short_name="log") +flags.DEFINE_string("log_file_name", None, "Name of the log file.", short_name="log") flags.DEFINE_string("log_level", "debug", "The level to log.") flags.DEFINE_integer( "initial_executors", 10, "The initial number of executors that are requested by each application.", ) +flags.DEFINE_float( + "spark_task_duration_multiplier", + 1, + "The multiplier used for spark job task runtimes. Buffer time is added " + "to ensure that tasks complete before the scheduler expects it to complete. " + "Completion of tasks after the scheduler's expected task completion time " + "is detrimental for scheduler's planning and could invalidate some schedules", +) flags.DEFINE_integer( "virtualized_cores", 500, @@ -64,15 +74,193 @@ "the framework. Refer to the `virtualized_cores` flag for more information.", ) flags.DEFINE_enum( - "scheduler", "EDF", ["FIFO", "EDF"], "The scheduler to use for this execution." + "scheduler", "DAGSched", ["FIFO", "EDF", "DAGSched"], "The scheduler to use for " + "this execution." +) +flags.DEFINE_enum( + "tpch_profile_type", "Cloudlab", ["Decima", "Cloudlab"], "The set of profiles to " + "use for execution of tpch queries. Note that Cloudlab profile has all 22 queries. " + "From the Decima profile we support only 15 queries (1-10, 12-14, 16, 19). 
The " + "rest might also run but DAG structure might not match Decima profiles." +) +flags.DEFINE_enum( + "tpch_dataset_size", "50", ["50", "100", "250", "500"], "Options in GB eg. 50g for " + "dataset size of TPCH query. The Cloudlab profile will be picked accordingly. " +) +flags.DEFINE_enum( + "tpch_max_executors_per_job", "50", ["50", "75", "100", "200"], "Options for " + "max executors to use for tpch queries. The Cloudlab profile will be picked " + "accordingly." +) +flags.DEFINE_bool( + "override_worker_cpu_count", + False, + "If True, worker CPU count will be set to INT_MAX. This allows us to scale up " + "spark experiments without actually deploying a large spark cluster.", +) +flags.DEFINE_bool( + "use_profile_to_scale_executors", + False, + "If True, it means that a fixed number of (max) executors was given to the " + "spark job to run. With this profile, we can directly use the profiled " + "stage runtime, while setting the number of required slots or executors " + "to 1 per stage. This allows us do the same scheduling but creates less " + "overhead for this rpc service while running the experiments.", +) +flags.DEFINE_bool( + "release_taskgraphs", + False, + "If True, all tasks from a graph are released if any of the tasks have " + "reached their release time.", +) +flags.DEFINE_bool( + "enforce_deadlines", + False, + "True if the ILP formulation must ensure that deadlines are met.", +) +flags.DEFINE_integer( + "scheduler_time_discretization", + 1, + "The length of each slot in the space-time matrix to consider for scheduling the " + "tasks (in µs). 
The default value is 1µs, and a higher value can lead to faster " + "solutions but a potentially lower goodput due to resources being blocked for the " + "entirety of the slot.", +) +flags.DEFINE_bool( + "scheduler_enable_optimization_pass", + False, + "If `True`, the scheduler runs pre/post-translation optimization passes" + "when registering STRL expression.", +) +flags.DEFINE_float( + "scheduler_reconsideration_period", + 0.1, + "The percentage of critical path duration until which the scheduler will try " + "placing the TaskGraph, and drop the TaskGraph if it cannot be placed after.", +) +flags.DEFINE_bool( + "retract_schedules", False, "Enable the retraction of previously decided schedules." +) +flags.DEFINE_integer( + "scheduler_time_limit", + 3, + "The time limit (in seconds) to allow the scheduler to keep " + "searching for solutions without finding a better one.", +) +flags.DEFINE_bool( + "scheduler_dynamic_discretization", + False, + "If `True`, the scheduler creates space-time matrix non-uniformly. " + "The discretization is dynamically decided based on the occupancy request for " + "each time slice. (default: False)", +) +flags.DEFINE_integer( + "scheduler_max_time_discretization", + 8, + "The maximum discretization that the scheduler can have (in µs). " + "Only used when scheduler_adaptive_discretization flag is enabled. (default: 8)", +) +flags.DEFINE_float( + "scheduler_max_occupancy_threshold", + 0.8, + "The percentage b/w 0 and 1 of maximum occupancy beyond which the discretization " + "would always be 1 incase of dynamic discretization. " + "This flag is only used when dynamic discretization is enabled (default: 0.8)", +) +flags.DEFINE_bool( + "finer_discretization_at_prev_solution", + False, + "If `True`, the scheduler keeps discretization of 1 around previous solution. " + "The discretization is dynamically decided based on the occupancy request for " + "each time slice. 
(default: False)", +) +flags.DEFINE_integer( + "finer_discretization_window", + 5, + "The window around previous solution that keeps discretization of 1.", +) +flags.DEFINE_bool( + "scheduler_selective_rescheduling", + False, + "If `True`, the supported schedulers will follow some pre-defined strategies for " + "selectively sampling TaskGraphs to reschedule.", +) +flags.DEFINE_integer( + "scheduler_plan_ahead_no_consideration_gap", + 4, + "The length of time gap (in µs) for which the reconsiderations are frozen. " + "From the current time to the consideration gap, any tasks placed will not be " + "reconsidered for rescheduling.", +) +flags.DEFINE_list( + "scheduler_log_times", + [], + "A list of timestamps (in µs) at which to request extra logging from the Scheduler." + "If scheduler_log_to_file is `True`, then extra information will be requested for " + "all timestamps.", +) +flags.DEFINE_integer( + "scheduler_selective_rescheduling_sample_size", + 5, + "If `scheduler_selective_rescheduling` is True, then this flag defines the number " + "of TaskGraphs to sample for rescheduling.", +) +flags.DEFINE_integer( + "min_task_graph_deadline_variance", + 10, + "The MIN percentage (additive) factor to be used with critical path length of the task graph. " + "This helps inform the deadline for the taskgraph and all tasks within the task " + "graph. The value be > 0 since the taskgraph would take atleast the critical path " + "time duration to complete.", +) +flags.DEFINE_integer( + "max_task_graph_deadline_variance", + 25, + "The MAX percentage (additive) factor to be used with critical path length of the task graph. " + "This helps inform the deadline for the taskgraph and all tasks within the task " + "graph. 
The value be > min_task_graph_deadline_variance since deadline is decided based on it.", +) +flags.DEFINE_bool( + "uniformly_sample_task_slots", + False, + "Enabling this ignores the TPCH profiled taskslots and uses a seeded, rng gerenated " + "num_tasks (= num_slots) for different stages of the TPCH job, uniformly sampled " + "in a range.", +) +flags.DEFINE_integer( + "random_seed", + random.randint(0, sys.maxsize), + "The seed to be used for random number generation. Defaults to a random number.", ) - # Define an item containing completion timestamp and task class TimedItem: + _next_id = 0 + _id_threshold = 99999 + def __init__(self, timestamp, task): self.timestamp = timestamp self.task = task + self.id = TimedItem._next_id + TimedItem._next_id += 1 + + # Reset _next_id if it crosses the threshold + # We keep _next_id bounded to avoid very large numbers + # which could lead to slightly slower comparions + if TimedItem._next_id > TimedItem._id_threshold: + TimedItem._next_id = 0 + + def __lt__(self, other): + """Less than comparison for TimedItem instances.""" + if self.timestamp == other.timestamp: + # Unique ID for each TimedItem acts as tie-breaker + # for inserting into PriorityQueue + return self.id < other.id + return self.timestamp < other.timestamp + + def __eq__(self, other): + """Equality comparison for TimedItem instances.""" + return self.timestamp == other.timestamp and self.id == other.id # Define a priority queue based on heapq module @@ -94,11 +282,17 @@ def empty(self): # Implement the service. class SchedulerServiceServicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): def __init__(self) -> None: - """Initialize the service, and setup the logger.""" + """Initialize the service, and setup the logger.""" # Values used by the Servicer. 
- self._logger = setup_logging(name=FLAGS.log_file, log_level=FLAGS.log_level) + self._logger = setup_logging( + name=__name__, + log_dir=FLAGS.log_dir, + log_file=FLAGS.log_file_name, + log_level=FLAGS.log_level + ) self._initialized = False self._initialization_time = -1 + self._last_step_up_time = EventTime.zero() self._master_uri = None # The simulator types maintained by the Servicer. @@ -107,23 +301,92 @@ def __init__(self) -> None: self._drivers: Mapping[str, Task] = {} self._workload = None + # Track taskgraph completion progress. + self._total_taskgraphs_registered = 0 + self._total_taskgraphs_missed = 0 + self._total_taskgraphs_met = 0 + self._total_taskgraphs_cancelled = 0 + self._cancelled_taskgraphs = set() + self._min_task_graph_deadline_variance = FLAGS.min_task_graph_deadline_variance + self._max_task_graph_deadline_variance = FLAGS.max_task_graph_deadline_variance + + # Setting a rng for future use + self._rng = random.Random(FLAGS.random_seed) + # Scheduler information maintained by the servicer. 
self._scheduler_running_lock = asyncio.Lock() self._scheduler_running = False self._rerun_scheduler = False + self._scheduler_is_task_type = False if FLAGS.scheduler == "EDF": - self._scheduler = EDFScheduler() + self._scheduler = EDFScheduler( + enforce_deadlines=FLAGS.enforce_deadlines, + _flags=FLAGS, + ) + self._scheduler_is_task_type = True elif FLAGS.scheduler == "FIFO": - self._scheduler = FIFOScheduler() + # NOTE: FIFO is supposed to be run as deadline unaware + self._scheduler = FIFOScheduler( + enforce_deadlines=FLAGS.enforce_deadlines, + _flags=FLAGS, + ) + self._scheduler_is_task_type = True + elif FLAGS.scheduler == "DAGSched": + # --scheduler=TetriSched + # --release_taskgraphs + # --enforce_deadlines + # --scheduler_time_discretization=1 ====> Conv to EventTime & passed through diff arg name + # --scheduler_enable_optimization_pass ====> Passed through _flags + # --retract_schedules + # --scheduler_dynamic_discretization ====> Passed through different argument name + # --scheduler_max_time_discretization=8 ====> Conv to EventTime & passed through diff arg name + # --scheduler_max_occupancy_threshold=0.999 ====> Passed through different argument name + # --finer_discretization_at_prev_solution + # --finer_discretization_window=4 + # --scheduler_selective_rescheduling (DISABLE) ====> Passed through _flags + # --scheduler_reconsideration_period=0.99 ====> Passed through _flags + + self._scheduler = TetriSchedScheduler( + release_taskgraphs=FLAGS.release_taskgraphs, + time_discretization=EventTime( + FLAGS.scheduler_time_discretization, EventTime.Unit.US + ), + _flags=FLAGS, + max_time_discretization=EventTime( + FLAGS.scheduler_max_time_discretization, EventTime.Unit.US + ), + enforce_deadlines=FLAGS.enforce_deadlines, + dynamic_discretization=FLAGS.scheduler_dynamic_discretization, + max_occupancy_threshold=FLAGS.scheduler_max_occupancy_threshold, + retract_schedules=FLAGS.retract_schedules, + finer_discretization_at_prev_solution=( + 
FLAGS.finer_discretization_at_prev_solution + ), + finer_discretization_window=EventTime( + FLAGS.finer_discretization_window, EventTime.Unit.US + ), + plan_ahead_no_consideration_gap=EventTime( + FLAGS.scheduler_plan_ahead_no_consideration_gap, EventTime.Unit.US + ), + log_to_file=True, + ) + self._scheduler_is_task_type = not FLAGS.release_taskgraphs else: raise ValueError(f"Unknown scheduler {FLAGS.scheduler}.") # Placement information maintained by the servicer. # The placements map the application IDs to the Placement retrieved from the - # scheduler. The placements are automatically clipped at the time of informing - # the framework of applying them to the executors. - # NOTE (Sukrit): This must always be sorted by the Placement time. - self._placements: Mapping[str, Sequence[Placement]] = defaultdict(list) + # scheduler. + # NOTE: (DG) This is a new nested dict implementation. + # First level of dict is a mapping from app-id to all tasks in that app-id + # Second level of dict is a mapping from tasks to exact placement. + # TODO: (DG) This will no longer be ordered by time, so the check needs to be + # done for all tasks? Also, we might need to delete the placement once executed? + self._placements: Dict[str, Dict[str, Placement]] = {} + + # _executed_placements keep a track of previously completed placements since + # placements are deleted after being released. Can be used for debugging. + self._executed_placements: Dict[str, Placement] = {} # Additional task information maintained by the servicer self._tasks_marked_for_completion = PriorityQueue() @@ -135,58 +398,265 @@ def __init__(self) -> None: async def schedule(self) -> None: """Schedules the tasks that have been added to the Workload.""" + current_time = EventTime(int(time.time()), EventTime.Unit.US) - self._initialization_time + async with self._scheduler_running_lock: if self._scheduler_running: self._logger.error( - "Scheduler already running, this should never be reached." 
+ "[%s] Scheduler already running, this should never be reached.", + current_time, ) return self._scheduler_running = True - current_time = EventTime(int(time.time()), EventTime.Unit.S) self._logger.info( - "Starting a scheduling cycle with %s TaskGraphs and %s Workers at %s.", + "[%s] Starting a scheduling cycle with %s TaskGraphs and %s Workers.", + current_time, len(self._workload.task_graphs), len(self._worker_pool.workers), - current_time, ) + # Cumulate the resources from all the WorkerPools + for worker_pool in self._worker_pools.worker_pools: + worker_pool_resources = worker_pool.resources + for resource_name in set( + map(lambda value: value[0].name, worker_pool_resources.resources) + ): + resource = Resource(name=resource_name, _id="any") + self._logger.info( + f"{current_time},WORKER_POOL_UTILIZATION,{worker_pool.id}," + f"{resource_name}," + f"{worker_pool_resources.get_allocated_quantity(resource)}," + f"{worker_pool_resources.get_available_quantity(resource)}" + ) + + # Perform worker pool step + self._logger.info( + "[%s] Need to perform a step before schedule().", + current_time, + ) + completed_tasks = self.PerformWorkerPoolStep(sim_time=current_time) + + # Finish all tasks that have now completed + for completed_task in completed_tasks: + self.CleanupTaskExecution( + task=completed_task, + sim_time=current_time + ) + + # TODO (Sukrit): Change this to a better implementation. # Let's do some simple scheduling for now, that gives a fixed number of # executors to all the available applications in intervals of 10 seconds. - if len(self._workload.task_graphs) >= 2: - placements = self._scheduler.schedule( - sim_time=current_time, + if len(self._workload.task_graphs) >= 1: + scheduler_placements = self._scheduler.schedule( + sim_time=EventTime(current_time.time, EventTime.Unit.US), workload=self._workload, worker_pools=self._worker_pools, ) - # Filter the placements that are not of type PLACE_TASK and that have not - # been placed. 
+ + # Filter the scheduler_placements that are now in CANCEL_TASK state. + cancel_task_placements = list(filter( + lambda p: p.placement_type == Placement.PlacementType.CANCEL_TASK, + scheduler_placements, + )) + self._logger.info( + "[%s] Received %s tasks to be cancelled: %s.", + current_time, + len(cancel_task_placements), + cancel_task_placements, + ) + # Issue task cancellations for identified tasks and taskgraphs so that + # the taskgraphs are no longer in consideration + for placement in cancel_task_placements: + # Update the task placement decision so that we can stop + # responding to RPC calls from its driver based on CANCEL_TASK type + + if placement.task.task_graph not in self._placements: + self._placements[placement.task.task_graph] = {} + self._logger.warning( + "[%s] Came to cancel a placement but taskgraph %s was not in " + "self._placements. Creating an empty dict entry.", + current_time, + placement.task.task_graph, + ) + self._placements[placement.task.task_graph][placement.task] = placement + self._logger.info( + "[%s] Added cancel placement to taskgraph %s for task %s. " + "Placement: %s", + current_time, + placement.task.task_graph, + placement.task, + placement, + ) + + # Since even one task getting cancelled, implies task-graph + # cancellation, we add the task-graph to cancelled set + if placement.task.task_graph not in self._cancelled_taskgraphs: + self._cancelled_taskgraphs.add(placement.task.task_graph) + self._total_taskgraphs_cancelled += 1 + self._logger.info( + "[%s] RUN_STATS (registered, met, missed, cancelled): %s, %s, %s, %s", + current_time, + self._total_taskgraphs_registered, + self._total_taskgraphs_met, + self._total_taskgraphs_missed, + self._total_taskgraphs_cancelled, + ) + + self._logger.info( + "[%s] Cancelling task: %s from taskgraph: %s", + current_time, + placement.task.name, + placement.task.task_graph, + ) + # Sending tasks to cancel. 
+ placement.task.cancel( + time=current_time, + ) + + # Also cancel the task-graph so that all dependent tasks are removed + task_graph = self._workload.get_task_graph(placement.task.task_graph) + if task_graph is None: + self._logger.error("[%s] No TaskGraph found for %s", + current_time, + placement.task.task_graph, + ) + + for cancelled_task in task_graph.cancel(placement.task, current_time): + self._logger.info( + "[%s] Further cancelling dependent task: %s from taskgraph: %s", + current_time, + placement.task.name, + placement.task.task_graph, + ) + + cancelled_task.cancel( + time=current_time, + ) + + # TODO: (DG): Ensure that task-graph is removed from the workload and + # doesn't show up in the next iteration of tetrisched scheduler? + + # Filter the scheduler_placements that are not of type PLACE_TASK and + # have not been placed. filtered_placements = filter( lambda p: p.placement_type == Placement.PlacementType.PLACE_TASK and p.is_placed(), - placements, + scheduler_placements, ) for placement in sorted( filtered_placements, key=attrgetter("placement_time") ): - self._placements[placement.task.task_graph].append(placement) + if placement.task.task_graph not in self._placements: + self._placements[placement.task.task_graph] = {} + self._logger.info( + "[%s] Want to add a placement but taskgraph %s was not in " + "self._placements. Creating an empty dict entry.", + current_time, + placement.task.task_graph, + ) + if placement.task not in self._placements[placement.task.task_graph]: + self._logger.info( + "[%s] Adding new placement to taskgraph %s for task %s. " + "Placement: %s", + current_time, + placement.task.task_graph, + placement.task, + placement, + ) + else: + self._logger.info( + "[%s] Updating an existing placement in taskgraph %s for task %s. 
" + "Placement: %s", + current_time, + placement.task.task_graph, + placement.task, + placement, + ) + self._placements[placement.task.task_graph][placement.task] = placement + # Schedule the task here since marking it as running requires it to be # scheduled before. We mark it to be running when we inform the # framework of the placement. + + # TODO: (DG) ASK - dont think tasks need to be marked as unscheduled on cancellation? placement.task.schedule( time=placement.placement_time, placement=placement, ) + + # Handle task placements that have returned with unplaced tasks + unplaced_placements = filter( + lambda p: p.placement_type == Placement.PlacementType.PLACE_TASK + and not p.is_placed(), + scheduler_placements, + ) + for placement in unplaced_placements: + if placement.task.task_graph not in self._placements: + self._logger.info( + "[%s] Taskgraph %s not found for task %s, couldn't invalidate " + "it or it was previously invalidated.", + current_time, + placement.task.task_graph, + placement.task, + ) + elif placement.task in self._placements[placement.task.task_graph]: + self._logger.info( + "[%s] Invalidated the placement (taskgraph %s and task %s)" + "from self._placements along with entire taskgraph.", + current_time, + placement.task.task_graph, + placement.task, + ) + for task in self._placements[placement.task.task_graph]: + self._logger.info( + "[%s] Invalidating the placement for task %s " + "from self._placements due to invalidation of %s.", + current_time, + task, + placement.task, + ) + # Unschedule the task + if task.state is TaskState.SCHEDULED: + task.unschedule(time=current_time) + else: + self._logger.warning( + "[%s] Could not unschedule since task %s was " + "found in state %s in during invalidation of %s.", + current_time, + task, + task.state, + placement.task, + ) + # delete the taskgraph at once since we cant change size + # of dict while iterating + del self._placements[placement.task.task_graph] + else: + self._logger.info( + "[%s] 
Couldn't invalidate placement (taskgraph %s and task %s)." + "It couldnt be found in self._placements.", + current_time, + placement.task.task_graph, + placement.task, + ) + + scheduler_end_time = EventTime(int(time.time()), EventTime.Unit.US) - self._initialization_time self._logger.info( - "Finished the scheduling cycle initiated at %s.", current_time + "[%s] Finished the scheduling cycle initiated at %s.", + scheduler_end_time, + current_time, ) # Check if another run of the Scheduler has been requested, and if so, create # a task for it. Otherwise, mark the scheduler as not running. async with self._scheduler_running_lock: self._scheduler_running = False + self._logger.info("[%s] self._rerun_scheduler: %s.", + scheduler_end_time, + self._rerun_scheduler, + ) if self._rerun_scheduler: self._rerun_scheduler = False asyncio.create_task(self.schedule()) @@ -210,31 +680,34 @@ async def RegisterFramework(self, request, context): """ if self._initialized: self._logger.warning( - "Framework already registered at %s with the address %s", + "Framework already registered at %s with the address %s at %s", self._initialization_time, self._master_uri, + self._initialization_time, ) return erdos_scheduler_pb2.RegisterFrameworkResponse( success=False, message=f"Framework already registered at " f"{self._initialization_time} at the address {self._master_uri}", - ) # Setup a new Framework instance. framework_name = request.name self._master_uri = request.uri - self._initialization_time = request.timestamp + self._initialization_time = EventTime(request.timestamp, EventTime.Unit.US) self._initialized = True self._logger.info( - "Registering framework %s with URI %s at %s", + "[%s] Registering framework %s with URI %s.", + self._initialization_time, framework_name, self._master_uri, - self._initialization_time, ) # Setup the simulator types. 
parsed_uri = urlparse(self._master_uri) - self._worker_pool = WorkerPool(name=f"WorkerPool_{parsed_uri.netloc}") + self._worker_pool = WorkerPool( + name=f"WorkerPool_{parsed_uri.netloc}", + _logger=self._logger + ) self._worker_pools = WorkerPools(worker_pools=[self._worker_pool]) self._workload = Workload.from_task_graphs({}) @@ -245,10 +718,13 @@ async def RegisterFramework(self, request, context): ) async def RegisterDriver(self, request, context): + sim_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + if not self._initialized: self._logger.warning( - "Trying to register a driver with name %s and id %s, " + "[%s] Trying to register a driver with name %s and id %s, " "but no framework is registered yet.", + sim_time, request.name, request.id, ) @@ -262,15 +738,17 @@ async def RegisterDriver(self, request, context): # TODO (Sukrit): We drop the memory requirements for now, we should use # them to do multi-dimensional packing using STRL. self._logger.info( - "Received a request to register a driver with name %s, URI: %s. " + "[%s] Received a request to register a driver with name %s, URI: %s. " "The driver requires %s cores and %s memory.", + sim_time, request.id, request.uri, request.cores, request.memory, ) driver_resources = Resources( - resource_vector={Resource(name="Slot_CPU", _id="any"): 1} + resource_vector={Resource(name="Slot_CPU", _id="any"): 1}, + _logger=self._logger, ) driver_job = Job( name=request.id, @@ -295,6 +773,7 @@ async def RegisterDriver(self, request, context): task_graph=request.uri, job=driver_job, deadline=EventTime.invalid(), + _logger=self._logger, ) self._drivers[request.id] = driver @@ -305,10 +784,10 @@ async def RegisterDriver(self, request, context): if worker.can_accomodate_strategy(execution_strategy): # This Worker can accomodate the Driver, we assign it here. 
placement_found = True - self._worker_pool.place_task(driver, execution_strategy, worker.id) + # self._worker_pool.place_task(driver, execution_strategy, worker.id) # Update the Task's state and placement information. - placement_time = EventTime(request.timestamp, EventTime.Unit.S) + placement_time = sim_time driver.schedule( time=placement_time, placement=Placement( @@ -325,22 +804,25 @@ async def RegisterDriver(self, request, context): # Tell the framework to start the driver. return erdos_scheduler_pb2.RegisterDriverResponse( success=True, - message=f"Driver {request.id} registered successfully!", + message=f"[{sim_time}] Driver {request.id} registered successfully!", worker_id=worker.name, ) if not placement_found: return erdos_scheduler_pb2.RegisterDriverResponse( success=False, - message=f"No Worker can accomodate the driver {request.id} yet.", + message=f"[{sim_time}] No Worker can accomodate the driver {request.id} yet.", worker_id="", ) async def DeregisterDriver(self, request, context): + completion_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + if not self._initialized: self._logger.warning( - "Trying to deregister a driver with id %s, " + "[%s] Trying to deregister a driver with id %s, " "but no framework is registered yet.", + completion_time, request.id, ) return erdos_scheduler_pb2.DeregisterDriverResponse( @@ -349,24 +831,24 @@ async def DeregisterDriver(self, request, context): if request.id not in self._drivers: self._logger.warning( - "Trying to deregister a driver with id %s, " + "[%s] Trying to deregister a driver with id %s, " "but no driver with that id is registered.", + completion_time, request.id, ) return erdos_scheduler_pb2.DeregisterDriverResponse( success=False, - message=f"Driver with id {request.id} not registered yet.", + message=f"[{completion_time}] Driver with id {request.id} not registered yet.", ) # Deregister the driver. 
driver = self._drivers[request.id] - completion_time = EventTime(request.timestamp, EventTime.Unit.S) - self._worker_pool.remove_task(completion_time, driver) + # self._worker_pool.remove_task(completion_time, driver) driver.finish(completion_time) del self._drivers[request.id] return erdos_scheduler_pb2.DeregisterDriverResponse( success=True, - message=f"Driver with id {request.id} deregistered successfully!", + message=f"[{completion_time}] Driver with id {request.id} deregistered successfully!", ) async def RegisterTaskGraph(self, request, context): @@ -374,10 +856,13 @@ async def RegisterTaskGraph(self, request, context): This is the entry point for a new application of Spark to register itself with the backend scheduler, and is intended as an EHLO. """ + sim_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + if not self._initialized: self._logger.warning( - "Trying to register a task graph with ID %s and name %s, " + "[%s] Trying to register a task graph with ID %s and name %s, " "but no framework is registered yet.", + sim_time, request.id, request.name, ) @@ -387,19 +872,21 @@ async def RegisterTaskGraph(self, request, context): if request.id in self._workload.task_graphs: self._logger.warning( - "The application with ID %s and name %s was already registered.", + "[%s] The application with ID %s and name %s was already registered.", + sim_time, request.id, request.name, ) return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=False, - message=f"Application ID {request.id} with name {request.name} " - f"already registered!", + message=f"[{sim_time}] Application ID {request.id} with name " + f"{request.name} already registered!", num_executors=0, ) self._logger.info( - "Attempting to register application ID %s with name %s", + "[%s] Attempting to register application ID %s with name %s", + sim_time, request.id, request.name, ) @@ -407,11 +894,34 @@ async def RegisterTaskGraph(self, request, context): # If yes, retrieve 
profiled slots and runtime info. If no, use default values is_tpch_query = False tpch_query_all_stage_info = None - if request.name.startswith("TPCH_"): + if request.name.startswith("TPCH Query"): is_tpch_query = True - # retrieve tasks-per-stage and runtime info based on query number - tpch_query_num = request.name.split("TPCH_Q", 1)[1] - tpch_query_all_stage_info = get_all_stage_info_for_query(tpch_query_num) + # retrieve tasks-per-stage and runtime info based on query specifications + # Split the string by spaces + query_parts = request.name.split() + + # Initialize dataset_size and max_executor variables with default + tpch_query_num = None + tpch_dataset_size = int(FLAGS.tpch_dataset_size) + tpch_max_executors_per_job = int(FLAGS.tpch_max_executors_per_job) + + # Check if the string has the required format + # Format 1: "TPCH Query " + # Format 2: "TPCH Query " + if len(query_parts) >= 3 and query_parts[0] == "TPCH" and query_parts[1] == "Query": + tpch_query_num = int(query_parts[2]) + + # If dataset size and max cores are provided + if len(query_parts) >= 5: + tpch_dataset_size = int(query_parts[3]) + tpch_max_executors_per_job = int(query_parts[4]) + + tpch_query_all_stage_info = get_all_stage_info_for_query( + query_num=tpch_query_num, + profile_type=FLAGS.tpch_profile_type, + dataset_size=tpch_dataset_size, + max_executors=tpch_max_executors_per_job) + same_structure, stage_id_mapping = verify_and_relable_tpch_app_graph( query_num=tpch_query_num, dependencies=request.dependencies ) @@ -419,14 +929,15 @@ async def RegisterTaskGraph(self, request, context): # return failure message if not tpch app isnt of same DAG structure if not same_structure: self._logger.warning( - "TPCH application with ID %s and name %s couldn't be registered." + "[%s] TPCH application with ID %s and name %s couldn't be registered." 
"DAG structure mismatch!", + sim_time, request.id, request.name, ) return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=False, - message=f"TPCH application ID {request.id} with name {request.name}" + message=f"[{sim_time}] TPCH application ID {request.id} with name {request.name}" f" couldn't be registered. DAG structure mismatch!", num_executors=0, ) @@ -434,31 +945,96 @@ async def RegisterTaskGraph(self, request, context): # Construct all the Tasks for the TaskGraph. task_ids_to_task: Mapping[int, Task] = {} default_resource = Resources( - resource_vector={Resource(name="Slot_CPU", _id="any"): 20} + resource_vector={Resource(name="Slot_CPU", _id="any"): 20}, + _logger=self._logger, ) - default_runtime = EventTime(20, EventTime.Unit.US) + default_runtime = EventTime( + math.ceil(20 * FLAGS.spark_task_duration_multiplier), + EventTime.Unit.US + ) - for task_dependency in request.dependencies: + for i, task_dependency in enumerate(request.dependencies): framework_task = task_dependency.key if is_tpch_query: mapped_stage_id = stage_id_mapping[framework_task.id] - task_slots = tpch_query_all_stage_info[mapped_stage_id]["num_tasks"] - task_runtime = tpch_query_all_stage_info[mapped_stage_id][ - "avg_task_duration" - ] + + # NOTE: task_slots and task_runtime given to scheduler might be updated + # based on tpch_max_executors_per_job. 
If task_slots > + # tpch_max_executors_per_job, we transform (task_slots * task_runtime) + # as tpch_max_executors_per_job * ( + # (task_slots * task_runtime)/tpch_max_executors_per_job + # ) + # TODO: (DG) It is not foolproof since scheduler can give more than + # tpch_max_executors_per_job to app if it decides to run multiple + # independent stages in parallel + + profiled_task_slots = ( + tpch_query_all_stage_info[mapped_stage_id]["num_tasks"] + ) + + # Profiled runtime (in ms) * duration_multiplier is converted + # to nearest second + profiled_task_runtime = math.ceil( + ( + tpch_query_all_stage_info[mapped_stage_id] + ["avg_task_duration_ms"]/1000 + ) * FLAGS.spark_task_duration_multiplier + ) + + if FLAGS.uniformly_sample_task_slots: + # Chosen to override profiled tasks slots for TPCH + # TODO: (DG) The (20,60) range is outside default max_executors + # set to 50. Need to update code to correctly use max_executors later + # TODO: (DG) Don't like that seed is now going to change the dag structure + # everytime a new app arrives in the workload. + # Induces variability but seems weird. + # NOTE: tpch_max_ececutors is 50 but we will sample upto 70. + task_slots = self._rng.randint(30, 70) + else: + task_slots = (profiled_task_slots + if profiled_task_slots <= tpch_max_executors_per_job + else tpch_max_executors_per_job + ) + + # TODO: (DG) Adjust runtime if using uniformly_sample_task_slots + # Currently, runtimes still being calculated based on profiled_task_slots + # Setting minimum task_runtime to 8s to allow stages to complete + task_runtime = max(8, ( + profiled_task_runtime + if profiled_task_slots <= tpch_max_executors_per_job + else math.ceil( + (profiled_task_slots * + profiled_task_runtime)/tpch_max_executors_per_job) + ) + ) + if profiled_task_slots > tpch_max_executors_per_job: + self._logger.info( + "[%s] Profiled slots > tpch_max_executors_per_job: %s. 
Converted " + "(slots,runtime) from (%s,%s) to (%s, %s)", + sim_time, + tpch_max_executors_per_job, + profiled_task_slots, + profiled_task_runtime, + task_slots, + task_runtime, + ) + self._logger.info( - "Creating Task for given app TPCH stage: %s, mapped to " - "original stage id %s, with tasks: %s and avg runtime: %s", + "[%s] Creating Task for given app TPCH stage: %s, mapped to " + "original stage id %s, with tasks: %s and avg runtime (s): %s. " + "Used multiplier: %s", + sim_time, framework_task.id, mapped_stage_id, task_slots, task_runtime, + FLAGS.spark_task_duration_multiplier, ) task_ids_to_task[framework_task.id] = Task( - name=framework_task.name, + name=f"task_{framework_task.name}_{i}", task_graph=request.id, job=Job( - name=framework_task.name, + name=f"job_{framework_task.name}_{i}", profile=WorkProfile( name=f"WorkProfile_{framework_task.name}", execution_strategies=ExecutionStrategies( @@ -472,7 +1048,8 @@ async def RegisterTaskGraph(self, request, context): Resource( name="Slot_CPU", _id="any" ): task_slots - } + }, + _logger=self._logger, ) ), batch_size=1, @@ -486,16 +1063,21 @@ async def RegisterTaskGraph(self, request, context): ), ), ), - deadline=EventTime(request.deadline, EventTime.Unit.S), + # NOTE: (DG) Removed setting deadline here and will set deadline + # based on taskgraphs critical path instead. + deadline=EventTime.invalid(), + # TODO (Sukrit): We should maintain a counter for each application # type so that we can correlate the Tasks with a particular invocation. timestamp=1, + _logger=self._logger, ) # NOTE (Sukrit): We maintain the StageID of the Task as a separate field # that is not accessible / used by the Simulator. 
task_ids_to_task[framework_task.id].stage_id = framework_task.id self._logger.info( - "Constructed Task %s for the TaskGraph %s.", + "[%s] Constructed Task %s for the TaskGraph %s.", + sim_time, framework_task.name, request.id, ) @@ -510,23 +1092,78 @@ async def RegisterTaskGraph(self, request, context): name=request.id, tasks=task_graph_structure, ) + + # Calculating critical path time from task graph + critical_path = task_graph.get_longest_path( + weights=lambda task: (task.slowest_execution_strategy.runtime.time) + ) + critical_path_time = ( + sum( + [t.slowest_execution_strategy.runtime for t in critical_path], + start=EventTime.zero(), + ) + .to(EventTime.Unit.US) + .time + ) + + # Setting taskgraph and task deadlines using critical_path_time * deadline_variance_factor + deadline_variance_factor = 1.0 + ( + self._rng.randint( + self._min_task_graph_deadline_variance, + self._max_task_graph_deadline_variance + ) + )/100 + task_graph_slo_time = math.ceil( + critical_path_time * deadline_variance_factor + ) + + for task in task_graph.get_nodes(): + deadline = EventTime(sim_time.time + task_graph_slo_time, + unit=EventTime.Unit.US + ) + task.update_deadline(deadline) + + task_graph.to_dot(f"{request.id}.dot") self._workload.add_task_graph(task_graph) self._logger.info( - "Added the TaskGraph(name=%s, id=%s) to the Workload.", + "[%s] Added the TaskGraph(name=%s, id=%s, deadline=%s, " + "critical_path_time = %s, task_graph_slo_time = %s, " + "deadline_variance_factor= %s) to the Workload.", + sim_time, request.name, request.id, + task_graph.deadline, + critical_path_time, + task_graph_slo_time, + deadline_variance_factor, ) self._logger.info( - "The structure of the TaskGraph %s is \n%s.", + "[%s] The structure of the TaskGraph %s is \n%s.", + sim_time, request.id, str(task_graph), ) + # Increment total number of taskgraphs registered. + self._total_taskgraphs_registered += 1 + + # Show current run statistics. 
+ self._logger.info( + "[%s] RUN_STATS (registered, met, missed, cancelled): %s, %s, %s, %s", + sim_time, + self._total_taskgraphs_registered, + self._total_taskgraphs_met, + self._total_taskgraphs_missed, + self._total_taskgraphs_cancelled, + ) + # Return the response. + # TODO: (DG) Might want to change the number of initial executors if it causes + # issues in scaled up expts return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=True, - message=f"Application ID {request.id} with name " - f"{request.name} and deadline {request.deadline} registered successfully!", + message=f"[{sim_time}] Application ID {request.id} with name " + f"{request.name} and deadline {task_graph.deadline} registered successfully!", num_executors=FLAGS.initial_executors, ) @@ -537,32 +1174,37 @@ async def RegisterEnvironmentReady(self, request, context): This is intended to release the sources of the TaskGraph to the scheduling backend, to consider the application in this scheduling cycle. """ + sim_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + if not self._initialized: self._logger.warning( - "Trying to register that the environment is ready for the TaskGraph " + "[%s] Trying to register that the environment is ready for the TaskGraph " "with ID %s, but no framework is registered yet.", + sim_time, request.id, ) return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( - success=False, message="Framework not registered yet." + success=False, message=f"[{sim_time}] Framework not registered yet." 
) - + task_graph = self._workload.get_task_graph(request.id) if task_graph is None: self._logger.warning( - "Trying to register that the environment is ready for the TaskGraph " + "[%s] Trying to register that the environment is ready for the TaskGraph " "with ID %s, but no TaskGraph with that ID is registered.", + sim_time, request.id, ) return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( success=False, - message=f"TaskGraph with ID {request.id} not registered yet.", + message=f"[{sim_time}] TaskGraph with ID {request.id} not registered yet.", ) if request.num_executors != FLAGS.initial_executors: self._logger.warning( - "The TaskGraph %s requires %s executors, but the environment is ready " + "[%s] The TaskGraph %s requires %s executors, but the environment is ready " "with %s executors.", + sim_time, request.id, FLAGS.initial_executors, request.num_executors, @@ -574,71 +1216,88 @@ async def RegisterEnvironmentReady(self, request, context): # Release all the sources of the TaskGraph at the given time. for source_task in task_graph.get_source_tasks(): - source_task.release(EventTime(request.timestamp, EventTime.Unit.S)) + source_task.release(sim_time) + + self._logger.info(f"[{sim_time}] Environment ready for TaskGraph with ID {request.id}!") # Run the scheduler since the Workload has changed. await self.run_scheduler() return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( success=True, - message=f"Environment ready for TaskGraph with ID {request.id}!", + message=f"[{sim_time}] Environment ready for TaskGraph with ID {request.id}!", ) async def DeregisterFramework(self, request, context): """Deregisters the framework with the backend scheduler. 
This is the exit point for a running instance of Spark / Flink to deregister""" + sim_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + if not self._initialized: self._logger.warning( - "Trying to deregister the framework at %s, " + "[%s] Trying to deregister the framework at %s, " "but no framework is registered yet.", + sim_time, request.uri, ) return erdos_scheduler_pb2.DeregisterFrameworkResponse( - success=False, message="Framework not registered yet." + success=False, message=f"[{sim_time}] Framework not registered yet." ) if not self._master_uri == request.uri: self._logger.warning( - "Trying to deregister the framework at %s, " + "[%s] Trying to deregister the framework at %s, " "but the registered framework is at %s.", + sim_time, request.uri, self._master_uri, ) return erdos_scheduler_pb2.DeregisterFrameworkResponse( success=False, - message=f"Framework not registered at {request.uri} yet.", + message=f"[{sim_time}] Framework not registered at {request.uri} yet.", ) # Deregister the framework. 
self._initialization_time = None self._master_uri = None self._initialized = False - self._logger.info("Deregistering framework at %s", request.uri) + self._logger.info("[%s] Deregistering framework at %s", sim_time, request.uri) return erdos_scheduler_pb2.DeregisterFrameworkResponse( success=True, - message=f"Framework at {request.uri} deregistered successfully!", + message=f"[{sim_time}] Framework at {request.uri} deregistered successfully!", ) async def RegisterWorker(self, request, context): """Registers a new worker with the backend scheduler.""" + current_time = EventTime(int(time.time()), EventTime.Unit.US) - self._initialization_time + if not self._initialized: self._logger.warning( - "Trying to register a worker with name %s and id %s, " + "[%s] Trying to register a worker with name %s and id %s, " "but no framework is registered yet.", + current_time, request.name, request.id, ) return erdos_scheduler_pb2.RegisterWorkerResponse( - success=False, message="Framework not registered yet." + success=False, message=f"[{current_time}] Framework not registered yet." ) # First, we construct the Resources with the given size. # TODO (Sukrit): Right now, we drop the memory requirements, we should use # them to do multi-dimensional packing using STRL. 
cpu_resource = Resource(name="Slot_CPU") - worker_resources = Resources(resource_vector={cpu_resource: request.cores}) + # TODO: (DG) Override the request.cores to avoid scaling up physical setup + worker_resources = Resources( + resource_vector={ + cpu_resource: request.cores if not FLAGS.override_worker_cpu_count + else 640 + }, + _logger=self._logger, + ) self._logger.debug( - "Successfully constructed the resources for the worker %s: %s.", + "[%s] Successfully constructed the resources for the worker %s: %s.", + current_time, request.name, worker_resources, ) @@ -647,11 +1306,13 @@ async def RegisterWorker(self, request, context): worker = Worker( name=request.id, resources=worker_resources, + _logger=self._logger, ) self._worker_pool.add_workers([worker]) self._logger.info( - "Registering worker with name %s, and resources %s.", + "[%s] Registering worker with name %s, and resources %s.", + current_time, worker.name, worker_resources, ) @@ -662,18 +1323,21 @@ async def RegisterWorker(self, request, context): return erdos_scheduler_pb2.RegisterWorkerResponse( success=True, - message=f"Worker {request.name} registered successfully!", + message=f"[{current_time}] Worker {request.name} registered successfully!", cores=FLAGS.virtualized_cores, memory=FLAGS.virtualized_memory * 1024, ) async def NotifyTaskCompletion(self, request, context): """Notifies the backend scheduler that a task has completed.""" + sim_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + if not self._initialized: self._logger.warning( - "Trying to notify the backend scheduler that the task with ID %s " + "[%s] Trying to notify the backend scheduler that the task with ID %s " "from application %s has completed, " "but no framework is registered yet.", + sim_time, request.task_id, request.application_id, ) @@ -684,15 +1348,16 @@ async def NotifyTaskCompletion(self, request, context): task_graph = self._workload.get_task_graph(request.application_id) if task_graph 
is None: self._logger.warning( - "Trying to notify the backend scheduler that the task with ID %s " + "[%s] Trying to notify the backend scheduler that the task with ID %s " "from application %s has completed, but the application " "was not registered with the backend yet.", + sim_time, request.task_id, request.application_id, ) return erdos_scheduler_pb2.NotifyTaskCompletionResponse( success=False, - message=f"Application with ID {request.application_id} " + message=f"[{sim_time}] Application with ID {request.application_id} " f"not registered yet.", ) @@ -703,15 +1368,16 @@ async def NotifyTaskCompletion(self, request, context): matched_task = task if matched_task is None: self._logger.warning( - "Trying to notify the backend scheduler that the task with ID %s " + "[%s] Trying to notify the backend scheduler that the task with ID %s " "from application %s has completed, but the task " "was not found in the TaskGraph.", + sim_time, request.task_id, request.application_id, ) return erdos_scheduler_pb2.NotifyTaskCompletionResponse( success=False, - message=f"Task with ID {request.task_id} " + message=f"[{sim_time}] Task with ID {request.task_id} " f"not found in TaskGraph {request.application_id}.", ) @@ -719,20 +1385,33 @@ async def NotifyTaskCompletion(self, request, context): # if it is actually complete or will complete in the future # Get the actual task completion timestamp + # actual_task_completion_time = ( + # matched_task.start_time.time + matched_task.remaining_time.time + # ) actual_task_completion_time = ( - matched_task.start_time.time + matched_task.remaining_time.time + sim_time.time + matched_task.remaining_time.time ) - current_time = time.time() self._logger.info( - "Received task for completion at time: %s , task.start_time: %s ," - "task.remaining_time (=runtime): %s , actual completion time: %s ", - round(current_time), + "[%s] Received task for completion. task.start_time: %s ," + "task.remaining_time: %s , actual completion time: %s. 
" + "Task details: %s", + sim_time.time, matched_task.start_time.time, matched_task.remaining_time.time, actual_task_completion_time, + matched_task, ) + if sim_time.time > actual_task_completion_time: + self._logger.warning( + "[%s] Task exceeded actual completion time by %s, " + "Task details: %s", + sim_time.time, + (sim_time.time - actual_task_completion_time), + matched_task, + ) + # TODO DG: remaining_time assumes execution of the slowest strategy # Should be updated to reflect correct remaining_time based on chosen strategy? @@ -749,19 +1428,20 @@ async def NotifyTaskCompletion(self, request, context): return erdos_scheduler_pb2.NotifyTaskCompletionResponse( success=True, message=f"Task with ID {request.task_id} marked for completion at " - f"{round(current_time)}! It will be removed on actual " + f"{sim_time}! It will be removed on actual " f"task completion time at {actual_task_completion_time}", ) async def GetPlacements(self, request, context): """Retrieves the placements applicable at the specified time.""" - request_timestamp = EventTime(request.timestamp, EventTime.Unit.S) + sim_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + if not self._initialized: self._logger.warning( - "Trying to get placements for %s at time %s, " + "[%s] Trying to get placements for %s, " "but no framework is registered yet.", + sim_time, request.id, - request_timestamp, ) return erdos_scheduler_pb2.GetPlacementsResponse( success=False, message="Framework not registered yet." 
@@ -769,42 +1449,159 @@ async def GetPlacements(self, request, context): if request.id not in self._placements: self._logger.warning( - "Trying to get placements for %s at time %s, but the application " - "was not registered with the backend yet.", + "[%s] Trying to get placements for %s, but the application " + "was not registered with the backend yet OR was cancelled.", + sim_time, request.id, - request_timestamp, ) - + return erdos_scheduler_pb2.GetPlacementsResponse( + success=False, + message=f"[{sim_time}] Trying to get placements for " + f"{request.id}, but the application was not registered with the " + f"backend yet OR was cancelled." + ) + # Construct and return the placements., placements = [] - clip_at = -1 - for index, placement in enumerate(self._placements[request.id]): - if placement.placement_time <= request_timestamp: - clip_at = index - # Mark the Task as RUNNING. - placement.task.start(request_timestamp) - - # resources = placement.execution_strategy.resources - placements.append( - erdos_scheduler_pb2.Placement( - worker_id=placement.worker_id, - application_id=request.id, - task_id=placement.task.stage_id, - cores=1, + + # Keep track of app_ids and task_names to delete after placements are issued + to_delete = [] + + for task in self._placements[request.id].keys(): + task_placement = self._placements[request.id][task] + if task.state is TaskState.CANCELLED: + # Task cancelled, add to list to remove from self._placements + to_delete.append((request.id, task)) + else: + if task_placement.placement_time <= sim_time: + # TODO: (DG) Due to small dataset size, each stage automatically gets + # one data partition i.e. one task and one executor. But later for + # large datasets, we might leverage use_profile_to_scale_executors + # to modify the placement before it is sent + self._logger.info( + f"[{sim_time}] Going to set placement.task to run: {task_placement}" ) + + # Mark the Task as RUNNING. 
+ # Right now we don't run task.start() if + # task is already in RUNNING or CANCELLED state. + # Only SCHEDULED -> RUNNING transition is allowed. + if task.state == TaskState.SCHEDULED: + try: + # Initialize the task at the given placement time, + # and place it on the WorkerPool. + worker_pool = self._worker_pools.get_worker_pool( + task_placement.worker_pool_id + ) + assert ( + worker_pool is not None + ), f"No WorkerPool found with ID: {task_placement.worker_pool_id}." + + # Display worker pool utilization before placing task + # Cumulate the resources from all the WorkerPools + for worker_pool in self._worker_pools.worker_pools: + worker_pool_resources = worker_pool.resources + for resource_name in set( + map(lambda value: value[0].name, worker_pool_resources.resources) + ): + resource = Resource(name=resource_name, _id="any") + self._logger.info( + f"{sim_time},WORKER_POOL_UTILIZATION,{worker_pool.id}," + f"{resource_name}," + f"{worker_pool_resources.get_allocated_quantity(resource)}," + f"{worker_pool_resources.get_available_quantity(resource)}" + ) + + # Perform worker pool step + self._logger.info( + "[%s] Need to perform a step before place_task() for %s.", + sim_time, + task, + ) + completed_tasks = self.PerformWorkerPoolStep(sim_time=sim_time) + + # Finish all tasks that have now completed + for completed_task in completed_tasks: + self.CleanupTaskExecution( + task=completed_task, + sim_time=sim_time + ) + + # Place the task on the worker pool + if self._scheduler_is_task_type: + success = True + else: + success = worker_pool.place_task( + task, + execution_strategy=task_placement.execution_strategy, + worker_id=task_placement.worker_id, + ) + if success: + task.start(sim_time) + self._logger.info( + "[%s] Successfully started task: %s on worker_pool: %s", + sim_time, + task, + worker_pool, + ) + # resources = placement.execution_strategy.resources + placements.append( + erdos_scheduler_pb2.Placement( + worker_id=task_placement.worker_id, + 
application_id=request.id, + task_id=task_placement.task.stage_id, + cores=1, + ) + ) + + # Add to delete list for clearing placement after it has been released + to_delete.append((request.id, task)) + self._logger.debug( + "[%s] Added tuple (%s, %s) to to_delete list.", + sim_time, + request.id, + task, + ) + + # Add task_placement to executed_placements since it is now complete + self._executed_placements[task] = task_placement + else: + self._logger.warning( + "[%s] Could not start task: %s on worker_id: %s and execution strategy: %s", + sim_time, + task, + task_placement.worker_id, + task_placement.execution_strategy, + ) + except ValueError as e: + self._logger.error(f"[{sim_time}] start() errored for task: {task}") + self._logger.error(f"[{sim_time}] Error: {e}") + + # Remove issued placements from self._placements + for app_id, task_name in to_delete: + del self._placements[app_id][task_name] + self._logger.info( + "[%s] Removed placement (app_id=%s, task_name=%s) from self._placements", + sim_time, + app_id, + task_name, ) - self._placements[request.id] = self._placements[request.id][clip_at + 1 :] + self._logger.info( - "Constructed %s placements at time %s for application with ID %s.", + "[%s] Constructed %s placements for application with ID %s.", + sim_time, len(placements), - request.timestamp, request.id, ) + + # Run the scheduler since the Workload has changed. 
+ await self.run_scheduler() + return erdos_scheduler_pb2.GetPlacementsResponse( success=True, placements=placements, - message=f"Constructed {len(placements)} " - f"placements at time {request.timestamp}.", + message=f"[{sim_time}] Constructed {len(placements)} " + f"placements.", ) # Function to pop tasks from queue based on actual completion time @@ -815,26 +1612,113 @@ async def PopTasksBasedOnTime(self): top_item = self._tasks_marked_for_completion._queue[0][1] # Check if top item's timestamp is reached or passed by current time - current_time = time.time() - if top_item.timestamp <= current_time: + current_time = EventTime(int(time.time()), EventTime.Unit.US) - self._initialization_time + if top_item.timestamp <= current_time.time: # Pop the top item popped_item = self._tasks_marked_for_completion.get() self._logger.info( - "Removing tasks from pending completion queue: %s at time: %s", - popped_item.task, + "[%s] Removing task from pending completion queue. " + "Task details: %s. " + "Timestamp: %s", current_time, + popped_item.task, + top_item.timestamp, ) - # Mark the Task as completed. 
- # Also release the task from the scheduler service - popped_item.task.update_remaining_time(EventTime.zero()) - popped_item.task.finish( - EventTime(round(current_time), EventTime.Unit.S) - ) + # Display worker pool utilization before removing task + # Cumulate the resources from all the WorkerPools + for worker_pool in self._worker_pools.worker_pools: + worker_pool_resources = worker_pool.resources + for resource_name in set( + map(lambda value: value[0].name, worker_pool_resources.resources) + ): + resource = Resource(name=resource_name, _id="any") + self._logger.info( + f"{current_time},WORKER_POOL_UTILIZATION,{worker_pool.id}," + f"{resource_name}," + f"{worker_pool_resources.get_allocated_quantity(resource)}," + f"{worker_pool_resources.get_available_quantity(resource)}" + ) + + # Perform worker pool step + self._logger.info( + "[%s] Need to perform a step before remove_task() for %s.", + current_time, + popped_item.task, + ) + completed_tasks = self.PerformWorkerPoolStep(sim_time=current_time) + # TODO: (DG) For simplicity, we only pop cleanup task state for a single + # popped-item in the loop at once. Later, we could cleanup all identified + # completed tasks here. + + if popped_item.task.state == TaskState.COMPLETED: + # It means that the task state was already cleaned up after another + # invocation of PerformWorkerPoolStep. Can skip here then. 
+ self._logger.info( + "[%s] Task %s already in COMPLETED state while processing " + "in PopTasksBasedOnTime.", + current_time, + popped_item.task, + ) + else: + self._logger.info( + "[%s] PopTasksBasedOnTime invoking CleanupTaskExecution " + "for task %s", + current_time, + popped_item.task, + ) + self.CleanupTaskExecution(task=popped_item.task, + sim_time=current_time) + + # # Free the resources on the worker pool for the completed task + # task_placed_at_worker_pool = self._worker_pools.get_worker_pool( + # popped_item.task.worker_pool_id + # ) + # task_placed_at_worker_pool.remove_task( + # current_time=current_time, task=popped_item.task + # ) + + # # Mark the Task as completed. + # # Also release the task from the scheduler service + # popped_item.task.update_remaining_time(EventTime.zero()) + # popped_item.task.finish(current_time) + + # # TODO: (DG) Check change here + # released_tasks, cancelled_tasks = self._workload.notify_task_completion( + # task=popped_item.task, + # finish_time=current_time) + + # # TODO: (DG) Check change here + # for new_released_task in released_tasks: + # new_released_task.release(current_time) + + # # TODO: Might do for cancelled too + + # # Mark task graph completed + # task_graph = self._workload.get_task_graph(popped_item.task.task_graph) + # if task_graph is None: + # self._logger.error(f"[{current_time}] Taskgraph for task {popped_item.task} is None") + # raise RuntimeError(f"[{current_time}] Taskgraph for task {popped_item.task} is None") + # if task_graph.is_complete(): + # self._logger.info(f"[{current_time}] Finished task_graph {task_graph.name}") + # if task_graph.deadline < current_time: + # self._logger.info(f"[{current_time}] Missed deadline for task_graph {task_graph.name}") + # self._total_taskgraphs_missed += 1 + # else: + # self._logger.info(f"[{current_time}] Met deadline for task_graph {task_graph.name}") + # self._total_taskgraphs_met += 1 + # self._logger.info( + # "[%s] RUN_STATS (registered, met, missed, 
cancelled): %s, %s, %s, %s", + # current_time, + # self._total_taskgraphs_registered, + # self._total_taskgraphs_met, + # self._total_taskgraphs_missed, + # self._total_taskgraphs_cancelled, + # ) # Run the scheduler since the Workload has changed. await self.run_scheduler() - + else: # If the top item's timestamp hasn't been reached yet, # sleep for a short duration @@ -843,6 +1727,84 @@ async def PopTasksBasedOnTime(self): # If the queue is empty, sleep for a short duration await asyncio.sleep(0.1) # TODO: Can adjust value, curr=0.1s + def PerformWorkerPoolStep(self, sim_time): + # Get time elapsed since last step up time + time_elapsed_since_last_step = ( + sim_time - self._last_step_up_time + ) + + # step up all tasks on the worker-pool to reflect correct remaining time + self._logger.info( + "[%s] Stepping for %s timesteps.", + sim_time, + time_elapsed_since_last_step, + ) + for worker_pool in self._worker_pools.worker_pools: + completed_tasks = worker_pool.step( + self._last_step_up_time, time_elapsed_since_last_step) + for task in completed_tasks: + self._logger.info( + "[%s] Task %s was now found complete.", + sim_time, + task, + ) + + # Update _last_step_up_time + self._last_step_up_time = sim_time + + return completed_tasks + + def CleanupTaskExecution(self, task, sim_time): + self._logger.info( + "[%s] Cleaning up task execution for task %s.", + sim_time, + task, + ) + + # Free the resources on the worker pool for the completed task + task_placed_at_worker_pool = self._worker_pools.get_worker_pool( + task.worker_pool_id + ) + task_placed_at_worker_pool.remove_task( + current_time=sim_time, task=task + ) + + # Mark the Task as completed. 
+ # Also release the task from the scheduler service + task.update_remaining_time(EventTime.zero()) + task.finish(sim_time) + + released_tasks, cancelled_tasks = self._workload.notify_task_completion( + task=task, + finish_time=sim_time) + + for new_released_task in released_tasks: + new_released_task.release(sim_time) + + # TODO: Might do for cancelled too + + # Mark task graph completed + task_graph = self._workload.get_task_graph(task.task_graph) + if task_graph is None: + self._logger.error(f"[{sim_time}] Taskgraph for task {task} is None") + raise RuntimeError(f"[{sim_time}] Taskgraph for task {task} is None") + if task_graph.is_complete(): + self._logger.info(f"[{sim_time}] Finished task_graph {task_graph.name}") + if task_graph.deadline < sim_time: + self._logger.info(f"[{sim_time}] Missed deadline for task_graph {task_graph.name}") + self._total_taskgraphs_missed += 1 + else: + self._logger.info(f"[{sim_time}] Met deadline for task_graph {task_graph.name}") + self._total_taskgraphs_met += 1 + self._logger.info( + "[%s] RUN_STATS (registered, met, missed, cancelled): %s, %s, %s, %s", + sim_time, + self._total_taskgraphs_registered, + self._total_taskgraphs_met, + self._total_taskgraphs_missed, + self._total_taskgraphs_cancelled, + ) + async def serve(): """Serves the ERDOS Scheduling RPC Server.""" @@ -860,6 +1822,17 @@ async def serve(): def main(argv): + # Parse the command-line flags + flags.FLAGS(argv) + + # Access the value of the flag + multiplier = flags.FLAGS.spark_task_duration_multiplier + override_worker_cpus = flags.FLAGS.override_worker_cpu_count + + # Your application logic here + print("Multiplier:", multiplier) + print("Override worker CPUs:", override_worker_cpus) + # Create an asyncio event loop loop = asyncio.get_event_loop() From 69876efbe930d4476a848e18a0d1f353fc9a8fbc Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 15:23:01 -0500 Subject: [PATCH 018/128] make workload_loader optional, make step and 
get_time_until_next_event public --- simulator.py | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/simulator.py b/simulator.py index 48ffc607..2f32f454 100644 --- a/simulator.py +++ b/simulator.py @@ -223,6 +223,9 @@ def reheapify(self): def __len__(self) -> int: return len(self._event_queue) + def __str__(self) -> str: + return str(self._event_queue) + class Simulator(object): """A `Simulator` simulates the execution of the different tasks in the @@ -251,7 +254,7 @@ def __init__( self, worker_pools: WorkerPools, scheduler: BaseScheduler, - workload_loader: BaseWorkloadLoader, + workload_loader: BaseWorkloadLoader = None, loop_timeout: EventTime = EventTime(time=sys.maxsize, unit=EventTime.Unit.US), scheduler_frequency: EventTime = EventTime(time=-1, unit=EventTime.Unit.US), _flags: Optional["absl.flags"] = None, @@ -259,7 +262,7 @@ def __init__( if not isinstance(scheduler, BaseScheduler): raise ValueError("Scheduler must implement the BaseScheduler interface.") - if not isinstance(workload_loader, BaseWorkloadLoader): + if workload_loader and not isinstance(workload_loader, BaseWorkloadLoader): raise ValueError( "WorkloadLoader must implement the BaseWorkloadLoader interface." ) @@ -392,15 +395,16 @@ def event_representation_filter(record): ) # Second, create the UPDATE_WORKLOAD event to retrieve the latest Workload. 
- upate_workload_event = Event( - event_type=EventType.UPDATE_WORKLOAD, time=self._simulator_time - ) - self._event_queue.add_event(upate_workload_event) - self._logger.info( - "[%s] Added %s to the event queue.", - self._simulator_time.time, - upate_workload_event, - ) + if self._workload_loader: + upate_workload_event = Event( + event_type=EventType.UPDATE_WORKLOAD, time=self._simulator_time + ) + self._event_queue.add_event(upate_workload_event) + self._logger.info( + "[%s] Added %s to the event queue.", + self._simulator_time.time, + upate_workload_event, + ) # Third, create the SCHEDULER_START event to invoke the scheduler. sched_start_event = Event( @@ -472,7 +476,7 @@ def simulate(self) -> None: """ # Run the simulator loop. while True: - time_until_next_event = self._event_queue.peek().time - self._simulator_time + time_until_next_event = self.time_until_next_event() # If there are any running tasks, step through the execution of the # Simulator until the closest remaining time. @@ -511,6 +515,12 @@ def simulate(self) -> None: if self.__handle_event(self._event_queue.next()): break + def time_until_next_event(self) -> EventTime: + return self._event_queue.peek().time - self._simulator_time + + def step(self, step_size: EventTime) -> None: + self.__step(step_size=step_size) + def __handle_scheduler_start(self, event: Event) -> None: """Handle the SCHEDULER_START event. The method invokes the scheduler, and adds a SCHEDULER_FINISHED event to the event queue. @@ -1503,6 +1513,11 @@ def __handle_update_workload(self, event: Event) -> None: raise ValueError( f"__handle_update_workload called with event of type {event.type}." 
) + if not self._workload_loader: + raise ValueError( + "UPDATE_WORKLOAD event enqueued without workload_loader" + ) + updated_workload = self._workload_loader.get_next_workload( current_time=self._simulator_time ) From d68772dce642f664c90b7fac653d6089ff789310 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 15:23:16 -0500 Subject: [PATCH 019/128] implement register/deregister framework --- rpc/service.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 82 insertions(+), 7 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 727e2afb..87d70dd8 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -1,6 +1,13 @@ import asyncio from concurrent import futures - +from urllib.parse import urlparse + +# TODO: refactor out the need to import main to get common flags +import main +from schedulers import EDFScheduler +from simulator import Simulator, EventTime +from workers import WorkerPool, WorkerPools +from utils import setup_logging, setup_csv_logging from rpc import erdos_scheduler_pb2 from rpc import erdos_scheduler_pb2_grpc @@ -18,13 +25,83 @@ class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): def __init__(self) -> None: - pass + self._logger = setup_logging( + name=__name__, + log_dir=FLAGS.log_dir, + log_file=FLAGS.log_file_name, + log_level=FLAGS.log_level, + ) + self._csv_logger = setup_csv_logging( + name=__name__, + log_dir=FLAGS.log_dir, + log_file=FLAGS.csv_file_name, + ) + for flag_name in FLAGS: + self._csv_logger.debug( + f"input_flag,{flag_name},{getattr(FLAGS, flag_name)}" + ) + + self._master_uri = None + self._initialization_time = None + self._simulator = None + self._scheduler = EDFScheduler() async def RegisterFramework(self, request, context): - pass + if self._simulator: + msg = f"Framework already registered at the address {self._master_uri} at timestamp {self._initialization_time}" + self._logger.error(msg) + return erdos_scheduler_pb2.RegisterFrameworkResponse( + 
success=False, + message=msg, + ) + + framework_name = request.name + self._master_uri = request.uri + self._initialization_time = EventTime(request.timestamp, EventTime.Unit.US) + + parsed_uri = urlparse(self._master_uri) + worker_pool = WorkerPool( + name=f"WorkerPool_{parsed_uri.netloc}", + _logger=self._logger, + ) + self._simulator = Simulator( + scheduler=self._scheduler, + worker_pools=WorkerPools([worker_pool]), + ) + + sim_time = ( + EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + ) + msg = f"[{sim_time}] Registered the framework '{framework_name}' with URI {self._master_uri} at {self._initialization_time.time}" + self._logger.info(msg) + return erdos_scheduler_pb2.RegisterFrameworkResponse(success=True, message=msg) async def DeregisterFramework(self, request, context): - pass + if not self._simulator: + msg = f"Trying to deregister a framework at {request.uri} but no framework has been registered yet." + self._logger.error(msg) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=False, message=msg + ) + + if self._master_uri != request.uri: + msg = f"Trying to deregister the framework at {request.uri} but the registered framework is at {self._master_uri}" + self._logger.error(msg) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=False, message=msg + ) + + sim_time = ( + EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + ) + self._initialization_time = None + self._master_uri = None + self._simulator = None + msg = f"[{sim_time}] Successfully deregistered the framework at {request.uri}" + self._logger.info(msg) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=True, message=msg + ) async def RegisterDriver(self, request, context): pass @@ -58,9 +135,7 @@ def main(_argv): loop = asyncio.get_event_loop() server = grpc.aio.server(futures.ThreadPoolExecutor(max_workers=FLAGS.max_workers)) - 
erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server( - Servicer(), server - ) + erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server(Servicer(), server) server.add_insecure_port(f"[::]:{FLAGS.port}") try: From 320cb346a67871827140e61bfe604dea7b5f9d56 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 15:30:06 -0500 Subject: [PATCH 020/128] refactor sim time calculation --- rpc/service.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 87d70dd8..050e8730 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -1,6 +1,8 @@ +import time import asyncio from concurrent import futures from urllib.parse import urlparse +from typing import Optional # TODO: refactor out the need to import main to get common flags import main @@ -46,6 +48,14 @@ def __init__(self) -> None: self._simulator = None self._scheduler = EDFScheduler() + def __sim_time(self, ts: Optional[int] = None): + if self._initialization_time is None: + raise ValueErorr("initialization time is not set") + if not ts: + ts = int(time.time()) + ts = EventTime(ts, EventTime.Unit.US) + return ts - self._initialization_time + async def RegisterFramework(self, request, context): if self._simulator: msg = f"Framework already registered at the address {self._master_uri} at timestamp {self._initialization_time}" @@ -69,9 +79,7 @@ async def RegisterFramework(self, request, context): worker_pools=WorkerPools([worker_pool]), ) - sim_time = ( - EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time - ) + sim_time = self.__sim_time(request.timestamp) msg = f"[{sim_time}] Registered the framework '{framework_name}' with URI {self._master_uri} at {self._initialization_time.time}" self._logger.info(msg) return erdos_scheduler_pb2.RegisterFrameworkResponse(success=True, message=msg) @@ -91,9 +99,7 @@ async def DeregisterFramework(self, request, context): success=False, message=msg ) - sim_time 
= ( - EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time - ) + sim_time = self.__sim_time(request.timestamp) self._initialization_time = None self._master_uri = None self._simulator = None From 43b833135adb0a971bc9aa777e5e38dc5ae3f5f5 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 17:26:06 -0500 Subject: [PATCH 021/128] implement RegisterWorker --- rpc/service.py | 85 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 13 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 050e8730..ae77bab8 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -8,7 +8,8 @@ import main from schedulers import EDFScheduler from simulator import Simulator, EventTime -from workers import WorkerPool, WorkerPools +from workers import Worker, WorkerPool, WorkerPools +from workload import Resource, Resources from utils import setup_logging, setup_csv_logging from rpc import erdos_scheduler_pb2 from rpc import erdos_scheduler_pb2_grpc @@ -23,6 +24,21 @@ flags.DEFINE_integer( "max_workers", 10, "Maximum number of workers to use for the RPC server." ) +flags.DEFINE_integer( + "virtualized_cores", + 500, + "The number of virtualized cores that must be created in each Worker on the " + "framework. This allows us to spawn a higher number of executors than the number " + "possible with actual available resources. Thus, we can spawn the executors for " + "each application, and only selectively activate them according to the actual " + "available resources.", +) +flags.DEFINE_integer( + "virtualized_memory", + 500, + "The amount of virtualized memory (in GB) that must be created in each Worker on " + "the framework. 
Refer to the `virtualized_cores` flag for more information.", +) class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): @@ -48,16 +64,8 @@ def __init__(self) -> None: self._simulator = None self._scheduler = EDFScheduler() - def __sim_time(self, ts: Optional[int] = None): - if self._initialization_time is None: - raise ValueErorr("initialization time is not set") - if not ts: - ts = int(time.time()) - ts = EventTime(ts, EventTime.Unit.US) - return ts - self._initialization_time - async def RegisterFramework(self, request, context): - if self._simulator: + if self.__framework_registered(): msg = f"Framework already registered at the address {self._master_uri} at timestamp {self._initialization_time}" self._logger.error(msg) return erdos_scheduler_pb2.RegisterFrameworkResponse( @@ -76,7 +84,9 @@ async def RegisterFramework(self, request, context): ) self._simulator = Simulator( scheduler=self._scheduler, - worker_pools=WorkerPools([worker_pool]), + worker_pools=WorkerPools( + [worker_pool] + ), # Maintain only one worker pool in the simulator ) sim_time = self.__sim_time(request.timestamp) @@ -85,7 +95,7 @@ async def RegisterFramework(self, request, context): return erdos_scheduler_pb2.RegisterFrameworkResponse(success=True, message=msg) async def DeregisterFramework(self, request, context): - if not self._simulator: + if not self.__framework_registered(): msg = f"Trying to deregister a framework at {request.uri} but no framework has been registered yet." 
self._logger.error(msg) return erdos_scheduler_pb2.DeregisterFrameworkResponse( @@ -122,7 +132,45 @@ async def RegisterEnvironmentReady(self, request, context): pass async def RegisterWorker(self, request, context): - pass + sim_time = self.__sim_time() + + if not self.__framework_registered(): + msg = f"[{sim_time}] Trying to register a worker (id={request.id}, name={request.name}) but no framework is registered yet" + return erdos_scheduler_pb2.RegisterWorkerResponse( + success=False, message=msg + ) + + # TODO(Sukrit): Right now, we drop the memory requirements, we should use + # them to do multi-dimensional packing using STRL. + + cpu_resource = Resource(name="Slot_CPU") + worker_resources = Resources( + resource_vector={ + # TODO(elton): handle override worker cpu count? + cpu_resource: request.cores, + }, + _logger=self._logger, + ) + worker = Worker( + name=request.id, + resources=worker_resources, + _logger=self._logger, + ) + + # Simulator maintains only one worker pool, so this should be fine + next(iter(self._simulator._worker_pools.worker_pools)).add_workers([worker]) + + msg = f"[{sim_time}] Registered worker (id={request.id}, name={request.name})." + + # TODO(elton): run scheduler? 
i think not, but keeping a TODO here to review later + + return erdos_scheduler_pb2.RegisterWorkerResponse( + success=True, + message=msg, + # TODO(elton): not sure why we need to set this here + cores=FLAGS.virtualized_cores, + memory=FLAGS.virtualized_memory * 1024, + ) async def GetPlacements(self, request, context): pass @@ -130,6 +178,17 @@ async def GetPlacements(self, request, context): async def NotifyTaskCompletion(self, request, context): pass + def __sim_time(self, ts: Optional[int] = None) -> EventTime: + if self._initialization_time is None: + raise ValueErorr("initialization time is not set") + if not ts: + ts = int(time.time()) + ts = EventTime(ts, EventTime.Unit.US) + return ts - self._initialization_time + + def __framework_registered(self): + return self._simulator is not None + async def serve(server): await server.start() From eccddc58f072ba1f6d18ec4df3e7162bf7e561f0 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 4 Nov 2024 17:29:19 -0500 Subject: [PATCH 022/128] factor out __get_worker_pool --- rpc/service.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index ae77bab8..26762ff4 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -157,8 +157,7 @@ async def RegisterWorker(self, request, context): _logger=self._logger, ) - # Simulator maintains only one worker pool, so this should be fine - next(iter(self._simulator._worker_pools.worker_pools)).add_workers([worker]) + self.__get_worker_pool().add_workers([worker]) msg = f"[{sim_time}] Registered worker (id={request.id}, name={request.name})." 
@@ -189,6 +188,12 @@ def __sim_time(self, ts: Optional[int] = None) -> EventTime: def __framework_registered(self): return self._simulator is not None + def __get_worker_pool(self): + # Simulator maintains only one worker pool, so this should be fine + return next(iter(self._simulator._worker_pools.worker_pools)).add_workers( + [worker] + ) + async def serve(server): await server.start() From e94b1f1d83bfc1ee55db0992bdddbd5af5a41802 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 7 Nov 2024 11:43:16 -0500 Subject: [PATCH 023/128] refactor tpch loader --- data/__init__.py | 2 +- data/tpch_loader.py | 416 ++++++++++++++++++++++++++------------------ main.py | 7 +- requirements.txt | 1 + 4 files changed, 250 insertions(+), 176 deletions(-) diff --git a/data/__init__.py b/data/__init__.py index 8c185fa6..9db1ee5b 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -7,7 +7,7 @@ from .task_loader_benchmark import TaskLoaderBenchmark from .task_loader_pylot import TaskLoaderPylot from .task_loader_synthetic import TaskLoaderSynthetic -from .tpch_loader import TpchLoader +from .tpch_loader import TpchWorkloadLoader from .worker_loader import WorkerLoader from .worker_loader_benchmark import WorkerLoaderBenchmark from .workload_loader import WorkloadLoader diff --git a/data/tpch_loader.py b/data/tpch_loader.py index 94962d61..ee5f5404 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -4,12 +4,13 @@ import sys import random -from typing import Any, Dict, List, Optional, Callable +from typing import Any, Dict, List, Optional, Callable, Tuple from pathlib import Path import absl import numpy as np import yaml +import networkx as nx from utils import EventTime, setup_logging from workload import ( @@ -27,15 +28,247 @@ from .base_workload_loader import BaseWorkloadLoader -class TpchLoader(BaseWorkloadLoader): - """Loads the TPCH trace from the provided file +class TpchLoader: + """Construct TPC-H task graph from a query profile Args: path (`str`): 
Path to a YAML file specifying the TPC-H query DAGs flags (`absl.flags`): The flags used to initialize the app, if any + """ - def __init__(self, path: str, flags: "absl.flags") -> None: + def __init__(self, path: Path, flags: "absl.flags"): + self._logger = setup_logging( + name=self.__class__.__name__, + log_dir=flags.log_dir, + log_file=flags.log_file_name, + log_level=flags.log_level, + ) + self._flags = flags + + # Load the TPC-H DAG structures + with open(path, "r") as f: + workload_data = yaml.safe_load(f) + self._graphs = {} + for query in workload_data["graphs"]: + query_num = int(query["name"][1:]) + self._graphs[query_num] = query["graph"] + + def make_task_graph( + self, + id: str, + query_num: int, + release_time: EventTime, + dependencies: Optional[List[Dict[str, Any]]] = None, + profile_type: Optional[str] = None, + dataset_size: Optional[int] = None, + max_executors_per_job: Optional[int] = None, + min_task_runtime: Optional[int] = None, + ) -> Tuple[TaskGraph, Dict[int, int]]: + if profile_type is None: + profile_type = self._flags.tpch_profile_type + if dataset_size is None: + dataset_size = self._flags.tpch_dataset_size + if max_executors_per_job is None: + max_executors_per_job = self._flags.tpch_max_executors_per_job + if min_task_runtime is None: + min_task_runtime = self._flags.tpch_min_task_runtime + + query_name = f"Q{query_num}" + + # Normalize dependencies + if dependencies is None: + dependencies = self._graphs[query_num] + deps_mapping = None + else: + deps_mapping = self.__map_dependencies(query_num, dependencies) + for node in dependencies: + node["name"] = deps_mapping[node["name"]] + if "children" in node: + node["children"] = [deps_mapping[c] for c in node["children"]] + self._logger.info( + f"Mapped dependencies for TPC-H query {query_name} as {deps_mapping}." 
+ ) + + # Construct a JobGraph + job_graph = JobGraph(name=f"{query_name}[{id}]") + profiler_data = get_all_stage_info_for_query( + query_num, + profile_type, + dataset_size, + max_executors_per_job, + ) + name_to_job = {} + for node in dependencies: + worker_profile = self.__make_work_profile( + profiler_data=profiler_data, + query_name=query_name, + node_name=node["name"], + max_executors_per_job=max_executors_per_job, + min_task_runtime=min_task_runtime, + ) + job = Job( + name=node["name"], + profile=worker_profile, + ) + name_to_job[node["name"]] = job + job_graph.add_job(job=job) + for node in dependencies: + job = name_to_job[node["name"]] + if "children" in node: + for child in node["children"]: + if child not in name_to_job: + raise ValueError( + f"Child {child} of {node['name']} was " + f"not present in the graph." + ) + child_job = name_to_job[child] + job_graph.add_child(job, child_job) + + # Construct TaskGraph from JobGraph + task_graph = job_graph.get_next_task_graph( + start_time=release_time, + _flags=self._flags, + ) + + self._logger.info(f"Constructed TaskGraph for TPC-H query {query_name}.") + + return task_graph, deps_mapping + + def __make_work_profile( + self, + profiler_data: Dict[int, Dict[str, Any]], + query_name: str, + node_name: str, + max_executors_per_job: int, + min_task_runtime: int, + ) -> WorkProfile: + profile = profiler_data[int(node_name)] + + profiled_task_slots = profile["num_tasks"] + profiled_runtime = math.ceil(profile["avg_task_duration_ms"] / 1e3) + + if profiled_task_slots > max_executors_per_job: + num_slots = max_executors_per_job + runtime = math.ceil( + (profiled_task_slots * profiled_runtime) / max_executors_per_job + ) + self._logger.debug( + "%s@%s: num_slots (%s) > max_executors_per_job (%s). 
Converted " + "(slots,runtime) from (%s,%s) to (%s, %s)", + node_name, + query_name, + profiled_task_slots, + max_executors_per_job, + profiled_task_slots, + profiled_runtime, + num_slots, + runtime, + ) + else: + num_slots = profiled_task_slots + runtime = profiled_runtime + + if runtime < min_task_runtime: + _runtime = runtime + runtime = max(min_task_runtime, _runtime) + self._logger.debug( + "%s@%s: runtime (%s) < min_task_runtime (%s). Converted " + "(slots,runtime) from (%s,%s) to (%s, %s)", + node_name, + query_name, + _runtime, + min_task_runtime, + num_slots, + _runtime, + num_slots, + runtime, + ) + + resources = Resources( + resource_vector={ + Resource(name="Slot", _id="any"): num_slots, + }, + ) + execution_strategies = ExecutionStrategies() + execution_strategies.add_strategy( + strategy=ExecutionStrategy( + resources=resources, + batch_size=1, + runtime=EventTime(runtime, EventTime.Unit.US), + ), + ) + return WorkProfile( + name=f"{query_name}_{node_name}_execution_profile", + execution_strategies=execution_strategies, + ) + + def __map_dependencies(self, query_num: int, deps: List[Dict[str, Any]]): + def deps_to_nx_graph(deps: List[Dict[str, Any]]): + query_dependency = [] + for node in deps: + if "children" in node: + for child in node["children"]: + query_dependency.append((node["name"], child)) + else: + # Ensure each tuple has two elements by adding a dummy node + query_dependency.append((node["name"], None)) + + # Remove any tuples where the second element is None + query_dependency = [ + edge for edge in query_dependency if edge[1] is not None + ] + + # convert job structure into a nx graph + nx_deps = nx.DiGraph(query_dependency) + + return nx_deps + + def are_structurally_same(graph1, graph2): + # Step 1: Check if both graphs have the same number of vertices + if len(graph1.nodes) != len(graph2.nodes): + return False, None + + # Step 2: Check if there exists a bijection between the vertices + # of the two graphs such that their adjacency 
relationships match + for mapping in nx.isomorphism.GraphMatcher( + graph1, graph2 + ).isomorphisms_iter(): + # Check if the adjacency relationships match + if all(v in mapping for u, v in graph1.edges): + # graph structures match + # mapping is a dict {key=original-stage-id, val=app-stage-id} + # we reverse reversed mapping from app-stage-id to orig-stage-id + reversed_mapping = {v: k for k, v in mapping.items()} + return True, reversed_mapping + + return False, None + + base_deps = self._graphs[query_num] + is_same, mapping = are_structurally_same( + deps_to_nx_graph(base_deps), deps_to_nx_graph(deps) + ) + + if not is_same: + raise ValueError( + f"Structure of dependencies provided for query number {query_num} does not match that of canonical dependencies. Provided: {deps}. Canonical: {base_deps}" + ) + + return mapping + + @property + def num_queries(self) -> int: + return len(self._graphs) + + +class TpchWorkloadLoader(BaseWorkloadLoader): + """Construct a TPC-H query workload + + Args: + flags (`absl.flags`): The flags used to initialize the app, if any + """ + + def __init__(self, flags: "absl.flags") -> None: self._flags = flags self._logger = setup_logging( name=self.__class__.__name__, @@ -50,29 +283,18 @@ def __init__(self, path: str, flags: "absl.flags") -> None: else: self._workload_update_interval = EventTime(sys.maxsize, EventTime.Unit.US) - # Set up task graph generators - with open(path, "r") as f: - workload_data = yaml.safe_load(f) - task_graph_generators = {} - for query in workload_data["graphs"]: - query_name = query["name"] - graph = query["graph"] - gen = self.make_task_graph_generator( - query_name=query_name, - graph=graph, - ) - task_graph_generators[query_name] = gen - self._task_graph_generators = task_graph_generators + # Instantiate tpch loader + self._tpch_loader = TpchLoader(path=flags.tpch_query_dag_spec, flags=flags) # Gather release times - release_policy = self._make_release_policy() + release_policy = 
self.__make_release_policy() release_times = release_policy.get_release_times( completion_time=EventTime(self._flags.loop_timeout, EventTime.Unit.US) ) # Sample queries to be released query_nums = [ - self._rng.randint(1, len(self._task_graph_generators)) + self._rng.randint(1, self._tpch_loader.num_queries) for _ in range(self._flags.override_num_invocation) ] @@ -82,7 +304,7 @@ def __init__(self, path: str, flags: "absl.flags") -> None: # Initialize workload self._workload = Workload.empty(flags) - def _make_release_policy(self): + def __make_release_policy(self): release_policy_args = {} if self._flags.override_release_policy == "periodic": release_policy_args = { @@ -131,151 +353,6 @@ def _make_release_policy(self): ), ) - def make_task_graph_generator( - self, - query_name: str, - graph: List[Dict[str, Any]], - ) -> Callable[[int, EventTime, EventTime], TaskGraph]: - def h(idx: int, current_time: EventTime, start_time: EventTime): - # Construct a JobGraph - job_graph = JobGraph(name=f"{query_name}[{idx}]") - query_num = int(query_name[1:]) - profiler_data = get_all_stage_info_for_query( - query_num, - self._flags.tpch_profile_type, - self._flags.tpch_dataset_size, - self._flags.tpch_max_executors_per_job, - ) - name_to_job = {} - for node in graph: - worker_profile = self.make_work_profile( - profiler_data=profiler_data, - query_name=query_name, - node_name=node["name"], - ) - job = Job( - name=node["name"], - profile=worker_profile, - ) - name_to_job[node["name"]] = job - job_graph.add_job(job=job) - for node in graph: - job = name_to_job[node["name"]] - if "children" in node: - for child in node["children"]: - if child not in name_to_job: - raise ValueError( - f"Child {child} of {node['name']} was " - f"not present in the graph." 
- ) - child_job = name_to_job[child] - job_graph.add_child(job, child_job) - - # Construct TaskGraph from JobGraph - task_graph = job_graph.get_next_task_graph( - start_time=start_time, - _flags=self._flags, - ) - - # Update deadline - critical_path = task_graph.get_longest_path( - weights=lambda task: (task.slowest_execution_strategy.runtime.time) - ) - critical_path_time = ( - sum( - [t.slowest_execution_strategy.runtime for t in critical_path], - start=EventTime.zero(), - ) - .to(EventTime.Unit.US) - .time - ) - deadline_variance_factor = ( - 1.0 - + ( - self._rng.randint( - self._flags.min_deadline_variance, - self._flags.max_deadline_variance, - ) - ) - / 100 - ) - task_graph_slo_time = math.ceil( - critical_path_time * deadline_variance_factor - ) - for task in task_graph.get_nodes(): - deadline = EventTime( - start_time.time + task_graph_slo_time, unit=EventTime.Unit.US - ) - task.update_deadline(deadline) - - return task_graph - - return h - - def make_work_profile( - self, profiler_data: Dict[int, Dict[str, Any]], query_name: str, node_name: str - ) -> WorkProfile: - profile = profiler_data[int(node_name)] - - profiled_task_slots = profile["num_tasks"] - profiled_runtime = math.ceil(profile["avg_task_duration_ms"] / 1e3) - - if profiled_task_slots > self._flags.tpch_max_executors_per_job: - num_slots = self._flags.tpch_max_executors_per_job - runtime = math.ceil( - (profiled_task_slots * profiled_runtime) - / self._flags.tpch_max_executors_per_job - ) - self._logger.debug( - "%s@%s: num_slots (%s) > tpch_max_executors_per_job (%s). 
Converted " - "(slots,runtime) from (%s,%s) to (%s, %s)", - node_name, - query_name, - profiled_task_slots, - self._flags.tpch_max_executors_per_job, - profiled_task_slots, - profiled_runtime, - num_slots, - runtime, - ) - else: - num_slots = profiled_task_slots - runtime = profiled_runtime - - if runtime < self._flags.tpch_min_task_runtime: - _runtime = runtime - runtime = max(self._flags.tpch_min_task_runtime, _runtime) - self._logger.debug( - "%s@%s: runtime (%s) < tpch_min_task_runtime (%s). Converted " - "(slots,runtime) from (%s,%s) to (%s, %s)", - node_name, - query_name, - _runtime, - self._flags.tpch_min_task_runtime, - num_slots, - _runtime, - num_slots, - runtime, - ) - - resources = Resources( - resource_vector={ - Resource(name="Slot", _id="any"): num_slots, - }, - ) - execution_strategies = ExecutionStrategies() - execution_strategies.add_strategy( - strategy=ExecutionStrategy( - resources=resources, - batch_size=1, - runtime=EventTime(runtime, EventTime.Unit.US), - ), - ) - return WorkProfile( - name=f"{query_name}_{node_name}_execution_profile", - execution_strategies=execution_strategies, - ) - def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: # Reset rng if this is the first workload. 
This is to ensure we have # parity with how jobs are spawned in Spark @@ -301,11 +378,10 @@ def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: return None for i, (q, t) in enumerate(to_release): - query_name = f"Q{q}" - task_graph = self._task_graph_generators[query_name]( - idx=i, - current_time=current_time, - start_time=t, + task_graph, _ = self._tpch_loader.make_task_graph( + id=str(i), + query_num=q, + release_time=t, ) self._workload.add_task_graph(task_graph) diff --git a/main.py b/main.py index 039dbbe5..2c42e338 100644 --- a/main.py +++ b/main.py @@ -9,7 +9,7 @@ TaskLoaderBenchmark, TaskLoaderPylot, TaskLoaderSynthetic, - TpchLoader, + TpchWorkloadLoader, WorkerLoader, WorkerLoaderBenchmark, WorkloadLoader, @@ -669,10 +669,7 @@ def main(args): flags=FLAGS, ) elif FLAGS.replay_trace == "tpch": - workload_loader = TpchLoader( - path=FLAGS.tpch_query_dag_spec, - flags=FLAGS, - ) + workload_loader = TpchWorkloadLoader(flags=FLAGS) else: raise NotImplementedError( f"Replay trace {FLAGS.replay_trace} is not implemented yet." 
diff --git a/requirements.txt b/requirements.txt index f3e8957c..4be1c543 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ cplex pre-commit black isort +networkx From ed510ab601b46b2d1bf79f79f40b90769da0ca84 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 7 Nov 2024 14:25:29 -0500 Subject: [PATCH 024/128] implement register task graph --- rpc/service.py | 129 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 114 insertions(+), 15 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 26762ff4..f4b850b0 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -3,6 +3,7 @@ from concurrent import futures from urllib.parse import urlparse from typing import Optional +from enum import Enum # TODO: refactor out the need to import main to get common flags import main @@ -10,6 +11,7 @@ from simulator import Simulator, EventTime from workers import Worker, WorkerPool, WorkerPools from workload import Resource, Resources +from data.tpch_loader import TpchLoader from utils import setup_logging, setup_csv_logging from rpc import erdos_scheduler_pb2 from rpc import erdos_scheduler_pb2_grpc @@ -39,9 +41,19 @@ "The amount of virtualized memory (in GB) that must be created in each Worker on " "the framework. 
Refer to the `virtualized_cores` flag for more information.", ) +flags.DEFINE_integer( + "spark_app_num_initial_executors", + 10, + "The initial number of executors that are requested by each Spark application.", +) + + +class DataLoader(Enum): + TPCH = "tpch" class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): + def __init__(self) -> None: self._logger = setup_logging( name=__name__, @@ -61,21 +73,33 @@ def __init__(self) -> None: self._master_uri = None self._initialization_time = None + self._data_loaders = {} + # TODO: refactor + self._data_loaders[DataLoader.TPCH] = TpchLoader( + path=FLAGS.tpch_query_dag_spec, flags=FLAGS + ) self._simulator = None self._scheduler = EDFScheduler() + self._registered_task_graphs = {} + async def RegisterFramework(self, request, context): + sim_time = self.__sim_time() + if self.__framework_registered(): - msg = f"Framework already registered at the address {self._master_uri} at timestamp {self._initialization_time}" + msg = f"[{sim_time}] Framework already registered at the address {self._master_uri} at timestamp {self._initialization_time}" self._logger.error(msg) return erdos_scheduler_pb2.RegisterFrameworkResponse( success=False, message=msg, ) + t = int(time.time()) framework_name = request.name self._master_uri = request.uri - self._initialization_time = EventTime(request.timestamp, EventTime.Unit.US) + self._initialization_time = EventTime(t, EventTime.Unit.US) + # Update sim_time now that initialization_time is set + sim_time = self.__sim_time() parsed_uri = urlparse(self._master_uri) worker_pool = WorkerPool( @@ -89,12 +113,13 @@ async def RegisterFramework(self, request, context): ), # Maintain only one worker pool in the simulator ) - sim_time = self.__sim_time(request.timestamp) - msg = f"[{sim_time}] Registered the framework '{framework_name}' with URI {self._master_uri} at {self._initialization_time.time}" + msg = f"[{sim_time}] Registered the framework '{framework_name}' with URI {self._master_uri} 
at UNIX time {self._initialization_time.time}" self._logger.info(msg) return erdos_scheduler_pb2.RegisterFrameworkResponse(success=True, message=msg) async def DeregisterFramework(self, request, context): + sim_time = self.__sim_time() + if not self.__framework_registered(): msg = f"Trying to deregister a framework at {request.uri} but no framework has been registered yet." self._logger.error(msg) @@ -109,7 +134,6 @@ async def DeregisterFramework(self, request, context): success=False, message=msg ) - sim_time = self.__sim_time(request.timestamp) self._initialization_time = None self._master_uri = None self._simulator = None @@ -126,9 +150,89 @@ async def DeregisterDriver(self, request, context): pass async def RegisterTaskGraph(self, request, context): - pass + sim_time = self.__sim_time() + + if not self.__framework_registered(): + msg = f"[{sim_time}] Trying to register a task graph (id={request.id}, name={request.name}) but no framework has been registered yet." + self._logger.error(msg) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message=msg, num_executors=0 + ) + + if request.id in self._registered_task_graphs: + msg = f"[{sim_time}] The task graph (id={request.id}, name={request.name}) is already registered" + self._logger.error(msg) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message=msg, num_executors=0 + ) + + # We only support TPCH queries for now + if request.name.startswith("TPCH Query"): + # Parse request name + query_parts = request.name.split() + if len(query_parts) != 3 and len(query_parts) != 5: + msg = f"[{sim_time}] Invalid TPCH query request" + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message=msg, num_executors=0 + ) + query_num = int(query_parts[2]) + if len(query_parts) == 5: + dataset_size = int(query_parts[3]) + max_executors_per_job = int(query_parts[4]) + else: + dataset_size = FLAGS.tpch_dataset_size + max_executors_per_job = 
FLAGS.tpch_max_executors_per_job
+
+            # Convert request.dependencies to [{name: int, children: [int]}]
+            dependencies = []
+            for dep in request.dependencies:
+                dependencies.append(
+                    {
+                        "name": int(dep.key.id),
+                        "children": [int(c) for c in dep.children_ids],
+                    }
+                )
+
+            # Construct the task graph
+            try:
+                task_graph, stage_id_mapping = self._data_loaders[
+                    DataLoader.TPCH
+                ].make_task_graph(
+                    id=request.id,
+                    query_num=query_num,
+                    release_time=sim_time,
+                    dependencies=dependencies,
+                    dataset_size=dataset_size,
+                    max_executors_per_job=max_executors_per_job,
+                )
+            except Exception as e:
+                msg = f"[{sim_time}] Failed to load TPCH query {query_num}. Exception: {e}"
+                return erdos_scheduler_pb2.RegisterTaskGraphResponse(
+                    success=False, message=msg, num_executors=0
+                )
+        else:
+            msg = f"[{sim_time}] The service only supports TPCH queries"
+            return erdos_scheduler_pb2.RegisterTaskGraphResponse(
+                success=False, message=msg, num_executors=0
+            )
+
+        self._registered_task_graphs[request.id] = (task_graph, stage_id_mapping)
+        msg = f"[{sim_time}] Registered task graph (id={request.id}, name={request.name}) successfully"
+
+        return erdos_scheduler_pb2.RegisterTaskGraphResponse(
+            success=True,
+            message=msg,
+            num_executors=FLAGS.spark_app_num_initial_executors,
+        )

     async def RegisterEnvironmentReady(self, request, context):
+        if not self.__framework_registered():
+            msg = f"[{sim_time}] Trying to notify that the environment is ready for task graph (id={request.id}, name={request.name}) but no framework is registered yet"
+            return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse(
+                success=False,
+                message=msg,
+            )
+
         pass

     async def RegisterWorker(self, request, context):
@@ -161,8 +265,6 @@ async def RegisterWorker(self, request, context):

         msg = f"[{sim_time}] Registered worker (id={request.id}, name={request.name})."

-        # TODO(elton): run scheduler? 
i think not, but keeping a TODO here to review later - return erdos_scheduler_pb2.RegisterWorkerResponse( success=True, message=msg, @@ -177,11 +279,10 @@ async def GetPlacements(self, request, context): async def NotifyTaskCompletion(self, request, context): pass - def __sim_time(self, ts: Optional[int] = None) -> EventTime: + def __sim_time(self) -> EventTime: if self._initialization_time is None: - raise ValueErorr("initialization time is not set") - if not ts: - ts = int(time.time()) + return EventTime.invalid() + ts = int(time.time()) ts = EventTime(ts, EventTime.Unit.US) return ts - self._initialization_time @@ -190,9 +291,7 @@ def __framework_registered(self): def __get_worker_pool(self): # Simulator maintains only one worker pool, so this should be fine - return next(iter(self._simulator._worker_pools.worker_pools)).add_workers( - [worker] - ) + return next(iter(self._simulator._worker_pools.worker_pools)) async def serve(server): From 81c43074bfca3b3e50b518c5927011ce62d1da7b Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 7 Nov 2024 21:57:50 -0500 Subject: [PATCH 025/128] add testing for service --- rpc/service.py | 6 +-- tests/test_service.py | 97 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 tests/test_service.py diff --git a/rpc/service.py b/rpc/service.py index f4b850b0..9b346c49 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -226,6 +226,8 @@ async def RegisterTaskGraph(self, request, context): ) async def RegisterEnvironmentReady(self, request, context): + sim_time = self.__sim_time() + if not self.__framework_registered(): msg = f"[{sim_time}] Trying to notify that the environment is ready for task graph (id={request.id}, name={request.name}) but no framework is registered yet" return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( @@ -233,8 +235,6 @@ async def RegisterEnvironmentReady(self, request, context): message=msg, ) - pass - async def 
RegisterWorker(self, request, context): sim_time = self.__sim_time() @@ -263,7 +263,7 @@ async def RegisterWorker(self, request, context): self.__get_worker_pool().add_workers([worker]) - msg = f"[{sim_time}] Registered worker (id={request.id}, name={request.name})." + msg = f"[{sim_time}] Registered worker (id={request.id}, name={request.name})" return erdos_scheduler_pb2.RegisterWorkerResponse( success=True, diff --git a/tests/test_service.py b/tests/test_service.py new file mode 100644 index 00000000..2f4ff84f --- /dev/null +++ b/tests/test_service.py @@ -0,0 +1,97 @@ +import re +import time +import subprocess + +import pytest +import grpc +from rpc import erdos_scheduler_pb2 +from rpc import erdos_scheduler_pb2_grpc + + +@pytest.fixture(scope="module", autouse=True) +def service(): + process = subprocess.Popen(["python", "-m", "rpc.service"]) + channel = grpc.insecure_channel("localhost:50051") + try: + grpc.channel_ready_future(channel).result(timeout=5) + yield process + finally: + channel.close() + process.kill() + + +def test_service(): + channel = grpc.insecure_channel("localhost:50051") + stub = erdos_scheduler_pb2_grpc.SchedulerServiceStub(channel) + + # Register a framework + request = erdos_scheduler_pb2.RegisterFrameworkRequest( + name="test_framework", uri="http://localhost/test", timestamp=1234567890 + ) + response = stub.RegisterFramework(request) + assert response.success and re.search( + r"Registered the framework 'test_framework' with URI http://localhost/test at UNIX time", + response.message, + ) + + # Register a worker + request = erdos_scheduler_pb2.RegisterWorkerRequest( + name="test_worker", + id="1234", + cores=100, + memory=1024, + ) + response = stub.RegisterWorker(request) + assert response.success and re.search( + r"Registered worker \(id=1234, name=test_worker\)", response.message + ) + + # Register an incorrect TaskGraph + request = erdos_scheduler_pb2.RegisterTaskGraphRequest( + id="task-graph", + name="TPCH Query 4 50 50", + 
timestamp=1234567890, + dependencies=[ + {"key": {"id": 0, "name": "stage 0"}, "children_ids": [1, 2]}, + ], + ) + response = stub.RegisterTaskGraph(request) + assert not response.success and re.search( + r"Failed to load TPCH query 4. Exception: Structure of dependencies provided for query number 4 does not match that of canonical dependencies", + response.message, + ) + + # Register a correct TaskGraph + request = erdos_scheduler_pb2.RegisterTaskGraphRequest( + id="task-graph", + name="TPCH Query 4 50 50", + timestamp=1234567890, + dependencies=[ + {"key": {"id": 0, "name": "stage 0"}, "children_ids": [2]}, + {"key": {"id": 1, "name": "stage 1"}, "children_ids": [2]}, + {"key": {"id": 2, "name": "stage 2"}, "children_ids": [3]}, + {"key": {"id": 3, "name": "stage 3"}, "children_ids": [4]}, + {"key": {"id": 4, "name": "stage 4"}, "children_ids": []}, + ], + ) + response = stub.RegisterTaskGraph(request) + assert ( + response.success + and re.search( + r"Registered task graph \(id=task-graph, name=TPCH Query 4 50 50\) successfully", + response.message, + ) + and response.num_executors == 10 + ) + + # Deregister framework + request = erdos_scheduler_pb2.DeregisterFrameworkRequest( + name="test_framework", uri="http://localhost/test", timestamp=1234567890 + ) + response = stub.DeregisterFramework(request) + assert response.success and re.search( + r"Successfully deregistered the framework at http://localhost/test", + response.message, + ) + + channel.close() From e1faa7f4b1d1c3e29dc932b524f840ece88026ed Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 7 Nov 2024 22:32:19 -0500 Subject: [PATCH 026/128] implement register environment ready --- rpc/service.py | 50 +++++++++++++++++++++++++++++++++++++------ tests/test_service.py | 16 ++++++++++++-- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 9b346c49..e276b9e0 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -1,6 +1,7 @@ import time import asyncio 
from concurrent import futures +from collections import namedtuple from urllib.parse import urlparse from typing import Optional from enum import Enum @@ -8,9 +9,10 @@ # TODO: refactor out the need to import main to get common flags import main from schedulers import EDFScheduler -from simulator import Simulator, EventTime +from simulator import Simulator, Event, EventTime, EventType from workers import Worker, WorkerPool, WorkerPools -from workload import Resource, Resources +from workload import Resource, Resources, Workload, TaskGraph +from data import BaseWorkloadLoader from data.tpch_loader import TpchLoader from utils import setup_logging, setup_csv_logging from rpc import erdos_scheduler_pb2 @@ -52,8 +54,21 @@ class DataLoader(Enum): TPCH = "tpch" -class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): +class WorkloadLoader(BaseWorkloadLoader): + def __init__(self) -> None: + self._workload = Workload.empty() + + def add_task_graph(self, task_graph: TaskGraph): + self._workload.add_task_graph(task_graph) + + def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: + return self._workload + + +RegisteredTaskGraph = namedtuple('RegisteredTaskGraph', ['graph', 'stage_id_mapping']) + +class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): def __init__(self) -> None: self._logger = setup_logging( name=__name__, @@ -74,15 +89,18 @@ def __init__(self) -> None: self._master_uri = None self._initialization_time = None self._data_loaders = {} - # TODO: refactor self._data_loaders[DataLoader.TPCH] = TpchLoader( path=FLAGS.tpch_query_dag_spec, flags=FLAGS ) self._simulator = None + self._workload_loader = None + self._scheduler = EDFScheduler() self._registered_task_graphs = {} + super().__init__() + async def RegisterFramework(self, request, context): sim_time = self.__sim_time() @@ -106,11 +124,13 @@ async def RegisterFramework(self, request, context): name=f"WorkerPool_{parsed_uri.netloc}", _logger=self._logger, ) + 
self._workload_loader = WorkloadLoader() self._simulator = Simulator( scheduler=self._scheduler, worker_pools=WorkerPools( [worker_pool] ), # Maintain only one worker pool in the simulator + workload_loader=self._workload_loader, ) msg = f"[{sim_time}] Registered the framework '{framework_name}' with URI {self._master_uri} at UNIX time {self._initialization_time.time}" @@ -136,6 +156,7 @@ async def DeregisterFramework(self, request, context): self._initialization_time = None self._master_uri = None + self._workload_loader = None self._simulator = None msg = f"[{sim_time}] Successfully deregistered the framework at {request.uri}" self._logger.info(msg) @@ -216,7 +237,7 @@ async def RegisterTaskGraph(self, request, context): success=False, message=msg, num_executors=0 ) - self._registered_task_graphs[request.id] = (task_graph, stage_id_mapping) + self._registered_task_graphs[request.id] = RegisteredTaskGraph(task_graph, stage_id_mapping) msg = f"[{sim_time}] Registered task graph (id={request.id}, name={request.name}) successfully" return erdos_scheduler_pb2.RegisterTaskGraphResponse( @@ -229,12 +250,27 @@ async def RegisterEnvironmentReady(self, request, context): sim_time = self.__sim_time() if not self.__framework_registered(): - msg = f"[{sim_time}] Trying to notify that the environment is ready for task graph (id={request.id}, name={request.name}) but no framework is registered yet" + msg = f"[{sim_time}] Trying to notify that the environment is ready for task graph (id={request.id}) but no framework is registered yet" return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( success=False, message=msg, ) + self._workload_loader.add_task_graph(self._registered_task_graphs[request.id].graph) + self._simulator._event_queue.add_event( + Event( + event_type=EventType.UPDATE_WORKLOAD, + time=sim_time, + ) + ) + + msg = f"[{sim_time}] Successfully marked environment as ready for task graph (id={request.id})" + self._logger.info(msg) + return 
erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( + success=True, + message=msg, + ) + async def RegisterWorker(self, request, context): sim_time = self.__sim_time() @@ -275,7 +311,7 @@ async def RegisterWorker(self, request, context): async def GetPlacements(self, request, context): pass - + async def NotifyTaskCompletion(self, request, context): pass diff --git a/tests/test_service.py b/tests/test_service.py index 2f4ff84f..64c34723 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -63,7 +63,7 @@ def test_service(): # Register a correct TaskGraph request = erdos_scheduler_pb2.RegisterTaskGraphRequest( - id="task-graph", + id="task-graph-0", name="TPCH Query 4 50 50", timestamp=1234567890, dependencies=[ @@ -78,12 +78,24 @@ def test_service(): assert ( response.success and re.search( - r"Registered task graph \(id=task-graph, name=TPCH Query 4 50 50\) successfully", + r"Registered task graph \(id=task-graph-0, name=TPCH Query 4 50 50\) successfully", response.message, ) and response.num_executors == 10 ) + # Mark the environment as ready + request = erdos_scheduler_pb2.RegisterEnvironmentReadyRequest( + id="task-graph-0", + num_executors=10, + timestamp=1234567890, + ) + response = stub.RegisterEnvironmentReady(request) + assert ( + response.success + and re.search(r"Successfully marked environment as ready for task graph \(id=task-graph-0\)", response.message) + ) + # Deregister framework request = erdos_scheduler_pb2.DeregisterFrameworkRequest( name="test_framework", uri="http://localhost/test", timestamp=1234567890 From 0025f3c2cc166de43fe990ac2d025ae0abc5ecc2 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Wed, 6 Nov 2024 23:38:15 -0500 Subject: [PATCH 027/128] init impl for get placements, readme with spark-erdos setup --- rpc/service.py | 33 ++++++++++++++- rpc/spark_erdos_setup.md | 87 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 rpc/spark_erdos_setup.md diff --git 
a/rpc/service.py b/rpc/service.py index e276b9e0..9bd6f979 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -98,6 +98,8 @@ def __init__(self) -> None: self._scheduler = EDFScheduler() self._registered_task_graphs = {} + # TODO: (Dhruv) Can we get the currently active task graphs directly from the workload object? + self._active_task_graphs = set() super().__init__() @@ -240,6 +242,10 @@ async def RegisterTaskGraph(self, request, context): self._registered_task_graphs[request.id] = RegisteredTaskGraph(task_graph, stage_id_mapping) msg = f"[{sim_time}] Registered task graph (id={request.id}, name={request.name}) successfully" + # Add the task graph to the active task graphs if registration is successful + self._active_task_graphs.add(request.id) + print(f"[{sim_time}] Task graph with {request.id} registered successfully. Active task graphs: {self._active_task_graphs}") + return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=True, message=msg, @@ -310,8 +316,33 @@ async def RegisterWorker(self, request, context): ) async def GetPlacements(self, request, context): - pass + sim_time = self.__sim_time() + + # TODO (Dhruv): Can add check to verify that framework and worker are registered + + # Check if the task graph is registered + if request.id not in self._registered_task_graphs: + msg = f"[{sim_time}] Task graph with id {request.task_graph_id} not registered." + return erdos_scheduler_pb2.GetPlacementsResponse( + success=False, + message=msg, + ) + # Check if the task graph is active + if request.id not in self._active_task_graphs: + msg = f"[{sim_time}] Task graph with id {request.task_graph_id} not active." 
+ return erdos_scheduler_pb2.GetPlacementsResponse( + success=False, + message=msg, + ) + + print(f"[{sim_time}] Processing GetPlacements request for task graph with id {request.id}") + + return erdos_scheduler_pb2.GetPlacementsResponse( + success=True, + message=f"Placements for taskgraph {request.id} returned successfully.", + ) + async def NotifyTaskCompletion(self, request, context): pass diff --git a/rpc/spark_erdos_setup.md b/rpc/spark_erdos_setup.md new file mode 100644 index 00000000..3a6e47c1 --- /dev/null +++ b/rpc/spark_erdos_setup.md @@ -0,0 +1,87 @@ +# Setup Instructions for Spark Mirror and ERDOS + +This README provides step-by-step instructions to set up the environment, compile the Spark Mirror, and build the ERDOS scheduling simulator. + +## Prerequisites +- Conda +- Git +- [Java Development Kit (JDK) 17.0.9](https://openjdk.org/) + +--- + +## Step 0: Create Conda Environment + +```bash +conda create -n dg_erdos python=3.10 +``` + +### Activate the environment: +```bash +conda activate dg_erdos +``` + +### If jdk17.0.9 isn't installed, install it for dg_erdos +```bash +conda install -c conda-forge openjdk=17.0.9 +``` + + +## Step 1: Clone spark mirror with submodules +```bash +git clone https://github.com/dhruvsgarg/spark_mirror.git --recursive +``` + +### Verify branch +Verify or set current branch `erdos-spark-integration` + +### Start sbt shell +NOTE: `JAVA_HOME` should automatically get set to `/serenity/scratch/dgarg/anaconda3/envs/dg_erdos/lib/jvm` + +```bash +./build/sbt +``` + +### Switch to project spark-core +```bash +project core +``` +### Compile and then package +```bash +compile +package +``` + +## Step 2: Compile ERDOS +### Clone repo +```bash +git clone https://github.com/erdos-project/erdos-scheduling-simulator.git --recursive +``` + +### Install requirements for the package +```bash +pip install -r requirements.txt +``` + +### Set `GUROBI_DIR` +```bash +export GUROBI_DIR=/serenity/scratch/dgarg/gurobi/gurobi1003/linux64 +``` + 
+### Build inside schedulers/tetrisched/build/ +```bash +export CMAKE_INSTALL_MODE=ABS_SYMLINK + +cmake .. -DINSTALL_GTEST=OFF -DTBB_INSTALL=OFF +``` + +* Verify that python bindings are written to the new `dg_erdos` conda env and not some old env + +### Run make +```bash +make -j install +``` + +### Test that simulator works with `simple_av_workload` +```bash +python3 main.py --flagfile=configs/simple_av_workload.conf > experiments/simple_av_workload_test.output +``` \ No newline at end of file From a7f18e3ce54f2c51fb0b098e17e6495ba7c22b35 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Tue, 12 Nov 2024 11:13:53 -0500 Subject: [PATCH 028/128] WIP: service changes to handle first tpch taskgraph --- data/tpch_loader.py | 2 +- rpc/service.py | 27 +++++++++++++++++++------- rpc/spark_erdos_setup.md | 25 ++++++++++++++++++++++++ simulator.py | 42 +++++++++++++++++++++++++++++++++++++--- 4 files changed, 85 insertions(+), 11 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index ee5f5404..23395007 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -195,7 +195,7 @@ def __make_work_profile( strategy=ExecutionStrategy( resources=resources, batch_size=1, - runtime=EventTime(runtime, EventTime.Unit.US), + runtime=EventTime(runtime, EventTime.Unit.S), ), ) return WorkProfile( diff --git a/rpc/service.py b/rpc/service.py index 9bd6f979..4e32c572 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -49,7 +49,6 @@ "The initial number of executors that are requested by each Spark application.", ) - class DataLoader(Enum): TPCH = "tpch" @@ -114,7 +113,7 @@ async def RegisterFramework(self, request, context): message=msg, ) - t = int(time.time()) + t = time.time_ns() // 1000 # Current epoch time in microseconds framework_name = request.name self._master_uri = request.uri self._initialization_time = EventTime(t, EventTime.Unit.US) @@ -133,6 +132,7 @@ async def RegisterFramework(self, request, context): [worker_pool] ), # Maintain only one worker pool in 
the simulator workload_loader=self._workload_loader, + _flags=FLAGS, ) msg = f"[{sim_time}] Registered the framework '{framework_name}' with URI {self._master_uri} at UNIX time {self._initialization_time.time}" @@ -289,7 +289,7 @@ async def RegisterWorker(self, request, context): # TODO(Sukrit): Right now, we drop the memory requirements, we should use # them to do multi-dimensional packing using STRL. - cpu_resource = Resource(name="Slot_CPU") + cpu_resource = Resource(name="Slot") worker_resources = Resources( resource_vector={ # TODO(elton): handle override worker cpu count? @@ -345,11 +345,21 @@ async def GetPlacements(self, request, context): async def NotifyTaskCompletion(self, request, context): pass + + async def _tick_simulator(self): + while True: + if self._simulator is not None: + current_sim_time = self.__sim_time() + self._logger.debug(f"[{current_sim_time}] Simulator tick real timestamp: {time.time_ns() // 1000}") + self._simulator.tick(tick_until=current_sim_time) + else: + print("Simulator instance is None") + await asyncio.sleep(0.1) # 100 milliseconds def __sim_time(self) -> EventTime: if self._initialization_time is None: return EventTime.invalid() - ts = int(time.time()) + ts = time.time_ns() // 1000 # Current epoch time in microseconds ts = EventTime(ts, EventTime.Unit.US) return ts - self._initialization_time @@ -371,8 +381,12 @@ def main(_argv): loop = asyncio.get_event_loop() server = grpc.aio.server(futures.ThreadPoolExecutor(max_workers=FLAGS.max_workers)) - erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server(Servicer(), server) + servicer = Servicer() + erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server(servicer, server) server.add_insecure_port(f"[::]:{FLAGS.port}") + + # Schedule the periodic tick_simulator task + loop.create_task(servicer._tick_simulator()) try: loop.run_until_complete(serve(server)) @@ -381,6 +395,5 @@ def main(_argv): finally: loop.close() - if __name__ == "__main__": - app.run(main) + 
app.run(main) \ No newline at end of file diff --git a/rpc/spark_erdos_setup.md b/rpc/spark_erdos_setup.md index 3a6e47c1..9f810264 100644 --- a/rpc/spark_erdos_setup.md +++ b/rpc/spark_erdos_setup.md @@ -84,4 +84,29 @@ make -j install ### Test that simulator works with `simple_av_workload` ```bash python3 main.py --flagfile=configs/simple_av_workload.conf > experiments/simple_av_workload_test.output +``` + + +## Step 3: Using the Spark-Erdos service + +From the base directory: + +### Install the requirements +```bash +pip install -r rpc/requirements.txt +``` + +### Run protoc to generate the service and message definitions using +```bash +python -m grpc_tools.protoc -I./rpc/protos --python_out=. --grpc_python_out=. ./rpc/protos/rpc/erdos_scheduler.proto +``` + +### Run the service using +```bash +python -m rpc.service +``` + +### Run the test_service script using +```bash +python -m rpc.dg_test_service ``` \ No newline at end of file diff --git a/simulator.py b/simulator.py index 2f32f454..c2121468 100644 --- a/simulator.py +++ b/simulator.py @@ -514,13 +514,47 @@ def simulate(self) -> None: self.__step(step_size=time_until_next_event) if self.__handle_event(self._event_queue.next()): break + + def tick(self, tick_until: EventTime) -> None: + """Run the simulator loop to execute enqueued events until a particular time""" + tick_size = tick_until - self._simulator_time + self._logger.debug( + "[%s] Running the simulator loop for a tick of %s to reach time of %s.", + self._simulator_time.to(EventTime.Unit.US).time, + tick_size, + tick_until.to(EventTime.Unit.US).time, + ) + running_tasks = self._worker_pools.get_placed_tasks() + + if running_tasks: + min_task_remaining_time = min( + map(attrgetter("remaining_time"), running_tasks) + ) + self._logger.debug( + "[%s] The minimum task remaining time was %s, " + "and the tick size was %s.", + self._simulator_time.to(EventTime.Unit.US).time, + min_task_remaining_time, + tick_size, + ) + + if min_task_remaining_time < 
tick_size: + self.__step(step_size=min_task_remaining_time) + self._logger.info(f"Completed step upto min_task_remaining_time of {min_task_remaining_time}") + else: + self.__step(step_size=tick_size) + self._logger.info(f"Had running tasks but completed step of tick_size: {tick_size}") + if self.__handle_event(self._event_queue.next()): + return + else: + self.__step(step_size=tick_size) + self._logger.info(f"No running tasks, completed step of tick_size: {tick_size}") + if self.__handle_event(self._event_queue.next()): + return def time_until_next_event(self) -> EventTime: return self._event_queue.peek().time - self._simulator_time - def step(self, step_size: EventTime) -> None: - self.__step(step_size=step_size) - def __handle_scheduler_start(self, event: Event) -> None: """Handle the SCHEDULER_START event. The method invokes the scheduler, and adds a SCHEDULER_FINISHED event to the event queue. @@ -1592,6 +1626,7 @@ def __handle_update_workload(self, event: Event) -> None: else self._simulator_time + self._workload_update_interval ), ) + # TODO: (DG) It keeps adding update workload events. Should we handle this from the service? 
self._event_queue.add_event(next_update_event) self._logger.info( "[%s] Added %s to the event queue.", @@ -1885,6 +1920,7 @@ def __get_next_scheduler_event( next_event is None and len(schedulable_tasks) == 0 and len(running_tasks) == 0 + and self._workload_update_interval != EventTime(9999999, EventTime.Unit.US) ): self._logger.info( "[%s] There are no currently schedulable tasks, no running tasks, " From 942ead738315ce51f37ea6b5dbb5f3c49769d83b Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Wed, 13 Nov 2024 16:59:50 -0500 Subject: [PATCH 029/128] Fix the tick() function to dequeue all events upto n, pass runtime_unit in tpch_loader --- data/tpch_loader.py | 11 +++++-- rpc/service.py | 4 ++- simulator.py | 71 ++++++++++++++++++++++++++------------------- 3 files changed, 53 insertions(+), 33 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index 23395007..7d7cd2d6 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -33,17 +33,24 @@ class TpchLoader: Args: path (`str`): Path to a YAML file specifying the TPC-H query DAGs + runtime_unit (`EventTime.Unit`): The unit of the runtime flags (`absl.flags`): The flags used to initialize the app, if any """ - def __init__(self, path: Path, flags: "absl.flags"): + def __init__( + self, + path: Path, + flags: "absl.flags", + runtime_unit: EventTime.Unit = EventTime.Unit.US + ): self._logger = setup_logging( name=self.__class__.__name__, log_dir=flags.log_dir, log_file=flags.log_file_name, log_level=flags.log_level, ) + self._runtime_unit = runtime_unit self._flags = flags # Load the TPC-H DAG structures @@ -195,7 +202,7 @@ def __make_work_profile( strategy=ExecutionStrategy( resources=resources, batch_size=1, - runtime=EventTime(runtime, EventTime.Unit.S), + runtime=EventTime(runtime, self._runtime_unit), ), ) return WorkProfile( diff --git a/rpc/service.py b/rpc/service.py index 4e32c572..92f500fa 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -89,7 +89,9 @@ def __init__(self) -> None: 
self._initialization_time = None self._data_loaders = {} self._data_loaders[DataLoader.TPCH] = TpchLoader( - path=FLAGS.tpch_query_dag_spec, flags=FLAGS + path=FLAGS.tpch_query_dag_spec, + flags=FLAGS, + runtime_unit=EventTime.Unit.S, ) self._simulator = None self._workload_loader = None diff --git a/simulator.py b/simulator.py index c2121468..43548276 100644 --- a/simulator.py +++ b/simulator.py @@ -516,41 +516,52 @@ def simulate(self) -> None: break def tick(self, tick_until: EventTime) -> None: - """Run the simulator loop to execute enqueued events until a particular time""" - tick_size = tick_until - self._simulator_time - self._logger.debug( - "[%s] Running the simulator loop for a tick of %s to reach time of %s.", - self._simulator_time.to(EventTime.Unit.US).time, - tick_size, - tick_until.to(EventTime.Unit.US).time, - ) - running_tasks = self._worker_pools.get_placed_tasks() - - if running_tasks: - min_task_remaining_time = min( - map(attrgetter("remaining_time"), running_tasks) - ) + """Run the simulator loop to execute enqueued events until a particular time.""" + + while self._simulator_time < tick_until: + tick_size = tick_until - self._simulator_time self._logger.debug( - "[%s] The minimum task remaining time was %s, " - "and the tick size was %s.", + "[%s] Running the simulator loop to reach time of %s with remaining tick size %s.", self._simulator_time.to(EventTime.Unit.US).time, - min_task_remaining_time, + tick_until.to(EventTime.Unit.US).time, tick_size, ) - - if min_task_remaining_time < tick_size: - self.__step(step_size=min_task_remaining_time) - self._logger.info(f"Completed step upto min_task_remaining_time of {min_task_remaining_time}") + + # Get current running tasks + running_tasks = self._worker_pools.get_placed_tasks() + + # Determine the next step size based on the smallest remaining task time or tick size + if running_tasks: + min_task_remaining_time = min( + map(attrgetter("remaining_time"), running_tasks) + ) + step_size = 
min(min_task_remaining_time, tick_size) + self._logger.debug( + "[%s] The minimum task remaining time was %s, " + "and the selected step size was %s.", + self._simulator_time.to(EventTime.Unit.US).time, + min_task_remaining_time, + step_size, + ) else: - self.__step(step_size=tick_size) - self._logger.info(f"Had running tasks but completed step of tick_size: {tick_size}") - if self.__handle_event(self._event_queue.next()): - return - else: - self.__step(step_size=tick_size) - self._logger.info(f"No running tasks, completed step of tick_size: {tick_size}") - if self.__handle_event(self._event_queue.next()): - return + step_size = tick_size # No tasks running, use the entire tick size + + # Step the simulator forward + self.__step(step_size=step_size) + self._logger.info( + f"Stepped simulator by {step_size}, new simulator time is {self._simulator_time}" + ) + + # Check and process the next event in the queue if it exists and is due + while ((self._event_queue.peek() is not None) and ( + self._event_queue.peek().time <= self._simulator_time)): + event = self._event_queue.next() + if self.__handle_event(event): + return # Exit early if event handling requires it + + self._logger.info( + f"Finished processing simulator events upto time: {self._simulator_time}" + ) def time_until_next_event(self) -> EventTime: return self._event_queue.peek().time - self._simulator_time From 2b0c895599ef218300d9daab58e335ae0f5b1536 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 08:27:53 -0500 Subject: [PATCH 030/128] refactor runtime unit setting --- data/tpch_loader.py | 10 ++++----- rpc/service.py | 53 ++++++++++++++++++++++++++++----------------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index 7d7cd2d6..24af3c84 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -33,7 +33,6 @@ class TpchLoader: Args: path (`str`): Path to a YAML file specifying the TPC-H query DAGs - runtime_unit 
(`EventTime.Unit`): The unit of the runtime flags (`absl.flags`): The flags used to initialize the app, if any """ @@ -42,15 +41,13 @@ def __init__( self, path: Path, flags: "absl.flags", - runtime_unit: EventTime.Unit = EventTime.Unit.US - ): + ): self._logger = setup_logging( name=self.__class__.__name__, log_dir=flags.log_dir, log_file=flags.log_file_name, log_level=flags.log_level, ) - self._runtime_unit = runtime_unit self._flags = flags # Load the TPC-H DAG structures @@ -71,6 +68,7 @@ def make_task_graph( dataset_size: Optional[int] = None, max_executors_per_job: Optional[int] = None, min_task_runtime: Optional[int] = None, + runtime_unit: EventTime.Unit = EventTime.Unit.US, ) -> Tuple[TaskGraph, Dict[int, int]]: if profile_type is None: profile_type = self._flags.tpch_profile_type @@ -113,6 +111,7 @@ def make_task_graph( node_name=node["name"], max_executors_per_job=max_executors_per_job, min_task_runtime=min_task_runtime, + runtime_unit=runtime_unit, ) job = Job( name=node["name"], @@ -149,6 +148,7 @@ def __make_work_profile( node_name: str, max_executors_per_job: int, min_task_runtime: int, + runtime_unit: EventTime, ) -> WorkProfile: profile = profiler_data[int(node_name)] @@ -202,7 +202,7 @@ def __make_work_profile( strategy=ExecutionStrategy( resources=resources, batch_size=1, - runtime=EventTime(runtime, self._runtime_unit), + runtime=EventTime(runtime, runtime_unit), ), ) return WorkProfile( diff --git a/rpc/service.py b/rpc/service.py index 92f500fa..b4339db9 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -49,6 +49,7 @@ "The initial number of executors that are requested by each Spark application.", ) + class DataLoader(Enum): TPCH = "tpch" @@ -59,12 +60,12 @@ def __init__(self) -> None: def add_task_graph(self, task_graph: TaskGraph): self._workload.add_task_graph(task_graph) - + def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: return self._workload -RegisteredTaskGraph = namedtuple('RegisteredTaskGraph', ['graph', 
'stage_id_mapping']) +RegisteredTaskGraph = namedtuple("RegisteredTaskGraph", ["graph", "stage_id_mapping"]) class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): @@ -91,7 +92,6 @@ def __init__(self) -> None: self._data_loaders[DataLoader.TPCH] = TpchLoader( path=FLAGS.tpch_query_dag_spec, flags=FLAGS, - runtime_unit=EventTime.Unit.S, ) self._simulator = None self._workload_loader = None @@ -115,7 +115,7 @@ async def RegisterFramework(self, request, context): message=msg, ) - t = time.time_ns() // 1000 # Current epoch time in microseconds + t = time.time_ns() // 1000 # Current epoch time in microseconds framework_name = request.name self._master_uri = request.uri self._initialization_time = EventTime(t, EventTime.Unit.US) @@ -229,6 +229,7 @@ async def RegisterTaskGraph(self, request, context): dependencies=dependencies, dataset_size=dataset_size, max_executors_per_job=max_executors_per_job, + runtime_unit=EvenTime.Unit.S, ) except Exception as e: msg = f"[{sim_time}] Failed to load TPCH query {query_num}. Exception: {e}" @@ -241,13 +242,17 @@ async def RegisterTaskGraph(self, request, context): success=False, message=msg, num_executors=0 ) - self._registered_task_graphs[request.id] = RegisteredTaskGraph(task_graph, stage_id_mapping) + self._registered_task_graphs[request.id] = RegisteredTaskGraph( + task_graph, stage_id_mapping + ) msg = f"[{sim_time}] Registered task graph (id={request.id}, name={request.name}) successfully" # Add the task graph to the active task graphs if registration is successful self._active_task_graphs.add(request.id) - print(f"[{sim_time}] Task graph with {request.id} registered successfully. Active task graphs: {self._active_task_graphs}") - + print( + f"[{sim_time}] Task graph with {request.id} registered successfully. 
Active task graphs: {self._active_task_graphs}" + ) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=True, message=msg, @@ -264,14 +269,16 @@ async def RegisterEnvironmentReady(self, request, context): message=msg, ) - self._workload_loader.add_task_graph(self._registered_task_graphs[request.id].graph) + self._workload_loader.add_task_graph( + self._registered_task_graphs[request.id].graph + ) self._simulator._event_queue.add_event( Event( event_type=EventType.UPDATE_WORKLOAD, time=sim_time, ) ) - + msg = f"[{sim_time}] Successfully marked environment as ready for task graph (id={request.id})" self._logger.info(msg) return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( @@ -308,6 +315,7 @@ async def RegisterWorker(self, request, context): self.__get_worker_pool().add_workers([worker]) msg = f"[{sim_time}] Registered worker (id={request.id}, name={request.name})" + self._logger.info(msg) return erdos_scheduler_pb2.RegisterWorkerResponse( success=True, @@ -319,9 +327,9 @@ async def RegisterWorker(self, request, context): async def GetPlacements(self, request, context): sim_time = self.__sim_time() - + # TODO (Dhruv): Can add check to verify that framework and worker are registered - + # Check if the task graph is registered if request.id not in self._registered_task_graphs: msg = f"[{sim_time}] Task graph with id {request.task_graph_id} not registered." @@ -329,7 +337,7 @@ async def GetPlacements(self, request, context): success=False, message=msg, ) - + # Check if the task graph is active if request.id not in self._active_task_graphs: msg = f"[{sim_time}] Task graph with id {request.task_graph_id} not active." 
@@ -337,9 +345,11 @@ async def GetPlacements(self, request, context): success=False, message=msg, ) - - print(f"[{sim_time}] Processing GetPlacements request for task graph with id {request.id}") - + + print( + f"[{sim_time}] Processing GetPlacements request for task graph with id {request.id}" + ) + return erdos_scheduler_pb2.GetPlacementsResponse( success=True, message=f"Placements for taskgraph {request.id} returned successfully.", @@ -347,12 +357,14 @@ async def GetPlacements(self, request, context): async def NotifyTaskCompletion(self, request, context): pass - + async def _tick_simulator(self): while True: if self._simulator is not None: current_sim_time = self.__sim_time() - self._logger.debug(f"[{current_sim_time}] Simulator tick real timestamp: {time.time_ns() // 1000}") + self._logger.debug( + f"[{current_sim_time}] Simulator tick real timestamp: {time.time_ns() // 1000}" + ) self._simulator.tick(tick_until=current_sim_time) else: print("Simulator instance is None") @@ -361,7 +373,7 @@ async def _tick_simulator(self): def __sim_time(self) -> EventTime: if self._initialization_time is None: return EventTime.invalid() - ts = time.time_ns() // 1000 # Current epoch time in microseconds + ts = time.time_ns() // 1000 # Current epoch time in microseconds ts = EventTime(ts, EventTime.Unit.US) return ts - self._initialization_time @@ -386,7 +398,7 @@ def main(_argv): servicer = Servicer() erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server(servicer, server) server.add_insecure_port(f"[::]:{FLAGS.port}") - + # Schedule the periodic tick_simulator task loop.create_task(servicer._tick_simulator()) @@ -397,5 +409,6 @@ def main(_argv): finally: loop.close() + if __name__ == "__main__": - app.run(main) \ No newline at end of file + app.run(main) From 7a41bfee2b47a6f3726834c51cfed2774fd1a2b1 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 08:38:05 -0500 Subject: [PATCH 031/128] reomve microsecond granularity for now, will add support 
later --- rpc/service.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index b4339db9..07fc7abb 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -115,10 +115,10 @@ async def RegisterFramework(self, request, context): message=msg, ) - t = time.time_ns() // 1000 # Current epoch time in microseconds + t = int(time.time()) framework_name = request.name self._master_uri = request.uri - self._initialization_time = EventTime(t, EventTime.Unit.US) + self._initialization_time = EventTime(t, EventTime.Unit.S) # Update sim_time now that initialization_time is set sim_time = self.__sim_time() @@ -229,7 +229,7 @@ async def RegisterTaskGraph(self, request, context): dependencies=dependencies, dataset_size=dataset_size, max_executors_per_job=max_executors_per_job, - runtime_unit=EvenTime.Unit.S, + runtime_unit=EventTime.Unit.S, ) except Exception as e: msg = f"[{sim_time}] Failed to load TPCH query {query_num}. Exception: {e}" @@ -249,9 +249,6 @@ async def RegisterTaskGraph(self, request, context): # Add the task graph to the active task graphs if registration is successful self._active_task_graphs.add(request.id) - print( - f"[{sim_time}] Task graph with {request.id} registered successfully. 
Active task graphs: {self._active_task_graphs}" - ) return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=True, @@ -320,7 +317,6 @@ async def RegisterWorker(self, request, context): return erdos_scheduler_pb2.RegisterWorkerResponse( success=True, message=msg, - # TODO(elton): not sure why we need to set this here cores=FLAGS.virtualized_cores, memory=FLAGS.virtualized_memory * 1024, ) @@ -368,13 +364,13 @@ async def _tick_simulator(self): self._simulator.tick(tick_until=current_sim_time) else: print("Simulator instance is None") - await asyncio.sleep(0.1) # 100 milliseconds + await asyncio.sleep(1) def __sim_time(self) -> EventTime: if self._initialization_time is None: return EventTime.invalid() - ts = time.time_ns() // 1000 # Current epoch time in microseconds - ts = EventTime(ts, EventTime.Unit.US) + ts = int(time.time()) + ts = EventTime(ts, EventTime.Unit.S) return ts - self._initialization_time def __framework_registered(self): From 6170819b92a417878a995bbbdc004060884d1a2a Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 08:44:37 -0500 Subject: [PATCH 032/128] rename sim_time to stime to make it less confusing --- rpc/service.py | 71 ++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 07fc7abb..2cc5a996 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -105,10 +105,10 @@ def __init__(self) -> None: super().__init__() async def RegisterFramework(self, request, context): - sim_time = self.__sim_time() + stime = self.__stime() if self.__framework_registered(): - msg = f"[{sim_time}] Framework already registered at the address {self._master_uri} at timestamp {self._initialization_time}" + msg = f"[{stime}] Framework already registered at the address {self._master_uri} at timestamp {self._initialization_time}" self._logger.error(msg) return erdos_scheduler_pb2.RegisterFrameworkResponse( success=False, @@ -119,8 +119,7 @@ async def 
RegisterFramework(self, request, context): framework_name = request.name self._master_uri = request.uri self._initialization_time = EventTime(t, EventTime.Unit.S) - # Update sim_time now that initialization_time is set - sim_time = self.__sim_time() + stime = self.__stime() parsed_uri = urlparse(self._master_uri) worker_pool = WorkerPool( @@ -137,22 +136,22 @@ async def RegisterFramework(self, request, context): _flags=FLAGS, ) - msg = f"[{sim_time}] Registered the framework '{framework_name}' with URI {self._master_uri} at UNIX time {self._initialization_time.time}" + msg = f"[{stime}] Registered the framework '{framework_name}' with URI {self._master_uri} at UNIX time {self._initialization_time.time}" self._logger.info(msg) return erdos_scheduler_pb2.RegisterFrameworkResponse(success=True, message=msg) async def DeregisterFramework(self, request, context): - sim_time = self.__sim_time() + stime = self.__stime() if not self.__framework_registered(): - msg = f"Trying to deregister a framework at {request.uri} but no framework has been registered yet." + msg = f"[{stime}] Trying to deregister a framework at {request.uri} but no framework has been registered yet." 
self._logger.error(msg) return erdos_scheduler_pb2.DeregisterFrameworkResponse( success=False, message=msg ) if self._master_uri != request.uri: - msg = f"Trying to deregister the framework at {request.uri} but the registered framework is at {self._master_uri}" + msg = f"[{stime}] Trying to deregister the framework at {request.uri} but the registered framework is at {self._master_uri}" self._logger.error(msg) return erdos_scheduler_pb2.DeregisterFrameworkResponse( success=False, message=msg @@ -162,7 +161,7 @@ async def DeregisterFramework(self, request, context): self._master_uri = None self._workload_loader = None self._simulator = None - msg = f"[{sim_time}] Successfully deregistered the framework at {request.uri}" + msg = f"[{stime}] Successfully deregistered the framework at {request.uri}" self._logger.info(msg) return erdos_scheduler_pb2.DeregisterFrameworkResponse( success=True, message=msg @@ -175,17 +174,17 @@ async def DeregisterDriver(self, request, context): pass async def RegisterTaskGraph(self, request, context): - sim_time = self.__sim_time() + stime = self.__stime() if not self.__framework_registered(): - msg = f"[{sim_time}] Trying to register a task graph (id={request.id}, name={request.name}) but no framework has been registered yet." + msg = f"[{stime}] Trying to register a task graph (id={request.id}, name={request.name}) but no framework has been registered yet." 
self._logger.error(msg) return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=False, message=msg, num_executors=0 ) if request.id in self._registered_task_graphs: - msg = f"[{sim_time}] The task graph (id={request.id}, name={request.name}) is already registered" + msg = f"[{stime}] The task graph (id={request.id}, name={request.name}) is already registered" self._logger.error(msg) return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=False, message=msg, num_executors=0 @@ -196,7 +195,7 @@ async def RegisterTaskGraph(self, request, context): # Parse request name query_parts = request.name.split() if len(query_parts) != 3 and len(query_parts) != 5: - msg = f"[{sim_time}] Invalid TPCH query request" + msg = f"[{stime}] Invalid TPCH query request" return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=False, message=msg, num_executors=0 ) @@ -225,19 +224,19 @@ async def RegisterTaskGraph(self, request, context): ].make_task_graph( id=id, query_num=query_num, - release_time=sim_time, + release_time=stime, dependencies=dependencies, dataset_size=dataset_size, max_executors_per_job=max_executors_per_job, runtime_unit=EventTime.Unit.S, ) except Exception as e: - msg = f"[{sim_time}] Failed to load TPCH query {query_num}. Exception: {e}" + msg = f"[{stime}] Failed to load TPCH query {query_num}. 
Exception: {e}" return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=False, message=msg, num_executors=0 ) else: - msg = f"[{sim_time}] The service only supports TPCH queries" + msg = f"[{stime}] The service only supports TPCH queries" return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=False, message=msg, num_executors=0 ) @@ -245,7 +244,7 @@ async def RegisterTaskGraph(self, request, context): self._registered_task_graphs[request.id] = RegisteredTaskGraph( task_graph, stage_id_mapping ) - msg = f"[{sim_time}] Registered task graph (id={request.id}, name={request.name}) successfully" + msg = f"[{stime}] Registered task graph (id={request.id}, name={request.name}) successfully" # Add the task graph to the active task graphs if registration is successful self._active_task_graphs.add(request.id) @@ -257,10 +256,10 @@ async def RegisterTaskGraph(self, request, context): ) async def RegisterEnvironmentReady(self, request, context): - sim_time = self.__sim_time() + stime = self.__stime() if not self.__framework_registered(): - msg = f"[{sim_time}] Trying to notify that the environment is ready for task graph (id={request.id}) but no framework is registered yet" + msg = f"[{stime}] Trying to notify that the environment is ready for task graph (id={request.id}) but no framework is registered yet" return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( success=False, message=msg, @@ -272,11 +271,11 @@ async def RegisterEnvironmentReady(self, request, context): self._simulator._event_queue.add_event( Event( event_type=EventType.UPDATE_WORKLOAD, - time=sim_time, + time=stime, ) ) - msg = f"[{sim_time}] Successfully marked environment as ready for task graph (id={request.id})" + msg = f"[{stime}] Successfully marked environment as ready for task graph (id={request.id})" self._logger.info(msg) return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( success=True, @@ -284,10 +283,10 @@ async def RegisterEnvironmentReady(self, request, context): ) 
async def RegisterWorker(self, request, context): - sim_time = self.__sim_time() + stime = self.__stime() if not self.__framework_registered(): - msg = f"[{sim_time}] Trying to register a worker (id={request.id}, name={request.name}) but no framework is registered yet" + msg = f"[{stime}] Trying to register a worker (id={request.id}, name={request.name}) but no framework is registered yet" return erdos_scheduler_pb2.RegisterWorkerResponse( success=False, message=msg ) @@ -311,7 +310,7 @@ async def RegisterWorker(self, request, context): self.__get_worker_pool().add_workers([worker]) - msg = f"[{sim_time}] Registered worker (id={request.id}, name={request.name})" + msg = f"[{stime}] Registered worker (id={request.id}, name={request.name})" self._logger.info(msg) return erdos_scheduler_pb2.RegisterWorkerResponse( @@ -322,13 +321,13 @@ async def RegisterWorker(self, request, context): ) async def GetPlacements(self, request, context): - sim_time = self.__sim_time() + stime = self.__stime() # TODO (Dhruv): Can add check to verify that framework and worker are registered # Check if the task graph is registered if request.id not in self._registered_task_graphs: - msg = f"[{sim_time}] Task graph with id {request.task_graph_id} not registered." + msg = f"[{stime}] Task graph with id {request.task_graph_id} not registered." return erdos_scheduler_pb2.GetPlacementsResponse( success=False, message=msg, @@ -336,16 +335,12 @@ async def GetPlacements(self, request, context): # Check if the task graph is active if request.id not in self._active_task_graphs: - msg = f"[{sim_time}] Task graph with id {request.task_graph_id} not active." + msg = f"[{stime}] Task graph with id {request.task_graph_id} not active." 
return erdos_scheduler_pb2.GetPlacementsResponse( success=False, message=msg, ) - print( - f"[{sim_time}] Processing GetPlacements request for task graph with id {request.id}" - ) - return erdos_scheduler_pb2.GetPlacementsResponse( success=True, message=f"Placements for taskgraph {request.id} returned successfully.", @@ -357,16 +352,18 @@ async def NotifyTaskCompletion(self, request, context): async def _tick_simulator(self): while True: if self._simulator is not None: - current_sim_time = self.__sim_time() - self._logger.debug( - f"[{current_sim_time}] Simulator tick real timestamp: {time.time_ns() // 1000}" - ) - self._simulator.tick(tick_until=current_sim_time) + stime = self.__stime() + self._logger.debug(f"[{stime}] Simulator tick") + self._simulator.tick(tick_until=stime) else: print("Simulator instance is None") await asyncio.sleep(1) - def __sim_time(self) -> EventTime: + def __stime(self) -> EventTime: + """ + Time as viewed by the service. Starts when a framework is registered + and ends when it is deregistered. + """ if self._initialization_time is None: return EventTime.invalid() ts = int(time.time()) From d76453a392f84a7c0289a4b6d4254098af88bdff Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 10:52:47 -0500 Subject: [PATCH 033/128] refactor tick --- simulator.py | 80 +++++++++++++++------------------------------------- 1 file changed, 22 insertions(+), 58 deletions(-) diff --git a/simulator.py b/simulator.py index 43548276..5038399f 100644 --- a/simulator.py +++ b/simulator.py @@ -5,7 +5,7 @@ from enum import Enum from functools import total_ordering from operator import attrgetter, itemgetter -from typing import Mapping, Optional, Sequence +from typing import Mapping, Optional, Sequence, Callable import absl # noqa: F401 @@ -469,14 +469,30 @@ def dry_run(self) -> None: ) def simulate(self) -> None: - """Run the simulator loop. + """Run the simulator loop to fixpoint. 
This loop requires the `Workload` to be populated with the `TaskGraph`s whose execution is to be simulated using the Scheduler. """ + self.__simulate_f(lambda _: True) + + def tick(self, until: EventTime) -> None: + """Tick the simulator until the specified time""" + self.__simulate_f(should_continue=lambda et: et <= until) + + def __simulate_f(self, should_continue: Callable[[EventTime], bool]) -> None: + """Helper function to run the simulator until a predicate is satisfied. + + The predicate (`should_continue`) receives the time of the next event + in the queue, using which it can use to decide whether or not to + simulate. + """ # Run the simulator loop. while True: - time_until_next_event = self.time_until_next_event() + if not should_continue(self._event_queue.peek().time): + break + + time_until_next_event = self.__time_until_next_event() # If there are any running tasks, step through the execution of the # Simulator until the closest remaining time. @@ -514,56 +530,8 @@ def simulate(self) -> None: self.__step(step_size=time_until_next_event) if self.__handle_event(self._event_queue.next()): break - - def tick(self, tick_until: EventTime) -> None: - """Run the simulator loop to execute enqueued events until a particular time.""" - - while self._simulator_time < tick_until: - tick_size = tick_until - self._simulator_time - self._logger.debug( - "[%s] Running the simulator loop to reach time of %s with remaining tick size %s.", - self._simulator_time.to(EventTime.Unit.US).time, - tick_until.to(EventTime.Unit.US).time, - tick_size, - ) - - # Get current running tasks - running_tasks = self._worker_pools.get_placed_tasks() - - # Determine the next step size based on the smallest remaining task time or tick size - if running_tasks: - min_task_remaining_time = min( - map(attrgetter("remaining_time"), running_tasks) - ) - step_size = min(min_task_remaining_time, tick_size) - self._logger.debug( - "[%s] The minimum task remaining time was %s, " - "and the selected 
step size was %s.", - self._simulator_time.to(EventTime.Unit.US).time, - min_task_remaining_time, - step_size, - ) - else: - step_size = tick_size # No tasks running, use the entire tick size - - # Step the simulator forward - self.__step(step_size=step_size) - self._logger.info( - f"Stepped simulator by {step_size}, new simulator time is {self._simulator_time}" - ) - - # Check and process the next event in the queue if it exists and is due - while ((self._event_queue.peek() is not None) and ( - self._event_queue.peek().time <= self._simulator_time)): - event = self._event_queue.next() - if self.__handle_event(event): - return # Exit early if event handling requires it - - self._logger.info( - f"Finished processing simulator events upto time: {self._simulator_time}" - ) - def time_until_next_event(self) -> EventTime: + def __time_until_next_event(self) -> EventTime: return self._event_queue.peek().time - self._simulator_time def __handle_scheduler_start(self, event: Event) -> None: @@ -1559,9 +1527,7 @@ def __handle_update_workload(self, event: Event) -> None: f"__handle_update_workload called with event of type {event.type}." ) if not self._workload_loader: - raise ValueError( - "UPDATE_WORKLOAD event enqueued without workload_loader" - ) + raise ValueError("UPDATE_WORKLOAD event enqueued without workload_loader") updated_workload = self._workload_loader.get_next_workload( current_time=self._simulator_time @@ -1637,8 +1603,7 @@ def __handle_update_workload(self, event: Event) -> None: else self._simulator_time + self._workload_update_interval ), ) - # TODO: (DG) It keeps adding update workload events. Should we handle this from the service? 
- self._event_queue.add_event(next_update_event) + # self._event_queue.add_event(next_update_event) self._logger.info( "[%s] Added %s to the event queue.", self._simulator_time.time, @@ -1931,7 +1896,6 @@ def __get_next_scheduler_event( next_event is None and len(schedulable_tasks) == 0 and len(running_tasks) == 0 - and self._workload_update_interval != EventTime(9999999, EventTime.Unit.US) ): self._logger.info( "[%s] There are no currently schedulable tasks, no running tasks, " From 35df499f69842b96765145eaa783904fb3ea0995 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 10:53:46 -0500 Subject: [PATCH 034/128] update rpc service to invoke new tick, update test --- rpc/service.py | 9 ++++++--- tests/test_service.py | 16 +++++++++++++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 2cc5a996..e78dcb8d 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -96,7 +96,8 @@ def __init__(self) -> None: self._simulator = None self._workload_loader = None - self._scheduler = EDFScheduler() + # TODO: address this + self._scheduler = EDFScheduler(runtime=EventTime(0, EventTime.Unit.US)) self._registered_task_graphs = {} # TODO: (Dhruv) Can we get the currently active task graphs directly from the workload object? @@ -327,7 +328,9 @@ async def GetPlacements(self, request, context): # Check if the task graph is registered if request.id not in self._registered_task_graphs: - msg = f"[{stime}] Task graph with id {request.task_graph_id} not registered." + msg = ( + f"[{stime}] Task graph with id {request.task_graph_id} not registered." 
+ ) return erdos_scheduler_pb2.GetPlacementsResponse( success=False, message=msg, @@ -354,7 +357,7 @@ async def _tick_simulator(self): if self._simulator is not None: stime = self.__stime() self._logger.debug(f"[{stime}] Simulator tick") - self._simulator.tick(tick_until=stime) + self._simulator.tick(until=stime) else: print("Simulator instance is None") await asyncio.sleep(1) diff --git a/tests/test_service.py b/tests/test_service.py index 64c34723..650d33ea 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -91,10 +91,20 @@ def test_service(): timestamp=1234567890, ) response = stub.RegisterEnvironmentReady(request) - assert ( - response.success - and re.search(r"Successfully marked environment as ready for task graph \(id=task-graph-0\)", response.message) + assert response.success and re.search( + r"Successfully marked environment as ready for task graph \(id=task-graph-0\)", + response.message, + ) + + time.sleep(16) + + # Get placements for the task + request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-0", ) + response = stub.GetPlacements(request) + assert False # Deregister framework request = erdos_scheduler_pb2.DeregisterFrameworkRequest( From acf6a4e63e356630c59f6e42fb95620c5fe680e2 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 11:00:30 -0500 Subject: [PATCH 035/128] add comment explaining scheduler runtime zero setting --- rpc/service.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rpc/service.py b/rpc/service.py index e78dcb8d..fcabc08b 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -96,7 +96,8 @@ def __init__(self) -> None: self._simulator = None self._workload_loader = None - # TODO: address this + # There is a bug in the simulator that causes tasks to be placed + # in the past if the scheduler runtime is not zero self._scheduler = EDFScheduler(runtime=EventTime(0, EventTime.Unit.US)) self._registered_task_graphs = {} From 
08aa855a65bb21fa3887f063a4a391b970712cee Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 11:14:15 -0500 Subject: [PATCH 036/128] oops forget to add comment on update workload --- simulator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simulator.py b/simulator.py index 5038399f..dd93230e 100644 --- a/simulator.py +++ b/simulator.py @@ -1603,6 +1603,7 @@ def __handle_update_workload(self, event: Event) -> None: else self._simulator_time + self._workload_update_interval ), ) + # TODO(elton): Handle this properly # self._event_queue.add_event(next_update_event) self._logger.info( "[%s] Added %s to the event queue.", From 106945fc0932421152a082abfde5aac7113e1a2a Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 14:02:04 -0500 Subject: [PATCH 037/128] refactor naming in tpch loader --- data/tpch_loader.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index 24af3c84..20d590cf 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -79,8 +79,6 @@ def make_task_graph( if min_task_runtime is None: min_task_runtime = self._flags.tpch_min_task_runtime - query_name = f"Q{query_num}" - # Normalize dependencies if dependencies is None: dependencies = self._graphs[query_num] @@ -92,11 +90,11 @@ def make_task_graph( if "children" in node: node["children"] = [deps_mapping[c] for c in node["children"]] self._logger.info( - f"Mapped dependencies for TPC-H query {query_name} as {deps_mapping}." + f"Mapped dependencies for TPC-H query {query_name(query_num)} as {deps_mapping}." 
) # Construct a JobGraph - job_graph = JobGraph(name=f"{query_name}[{id}]") + job_graph = JobGraph(name=task_graph_name(query_num, id)) profiler_data = get_all_stage_info_for_query( query_num, profile_type, @@ -107,7 +105,7 @@ def make_task_graph( for node in dependencies: worker_profile = self.__make_work_profile( profiler_data=profiler_data, - query_name=query_name, + query_num=query_num, node_name=node["name"], max_executors_per_job=max_executors_per_job, min_task_runtime=min_task_runtime, @@ -137,14 +135,16 @@ def make_task_graph( _flags=self._flags, ) - self._logger.info(f"Constructed TaskGraph for TPC-H query {query_name}.") + self._logger.info( + f"Constructed TaskGraph for TPC-H query {query_name(query_num)}." + ) return task_graph, deps_mapping def __make_work_profile( self, profiler_data: Dict[int, Dict[str, Any]], - query_name: str, + query_num: int, node_name: str, max_executors_per_job: int, min_task_runtime: int, @@ -164,7 +164,7 @@ def __make_work_profile( "%s@%s: num_slots (%s) > max_executors_per_job (%s). Converted " "(slots,runtime) from (%s,%s) to (%s, %s)", node_name, - query_name, + query_name(query_num), profiled_task_slots, max_executors_per_job, profiled_task_slots, @@ -183,7 +183,7 @@ def __make_work_profile( "%s@%s: runtime (%s) < min_task_runtime (%s). 
Converted " "(slots,runtime) from (%s,%s) to (%s, %s)", node_name, - query_name, + query_name(query_num), _runtime, min_task_runtime, num_slots, @@ -206,7 +206,7 @@ def __make_work_profile( ), ) return WorkProfile( - name=f"{query_name}_{node_name}_execution_profile", + name=f"{query_name(query_num)}_{node_name}_execution_profile", execution_strategies=execution_strategies, ) @@ -395,6 +395,14 @@ def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: return self._workload +def query_name(query_num: int) -> str: + return f"Q{query_num}" + + +def task_graph_name(query_num: int, id: any) -> str: + return f"{query_name(query_num)}[{id}]" + + def make_release_policy( release_policy, release_policy_args, rng, seed, randomize_start_time=(0, 0) ): From 52828b78edb23cfa5270ec242520d0c1e204c8ef Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 16:11:59 -0500 Subject: [PATCH 038/128] fix documentation error in workload/tasks.py --- workload/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workload/tasks.py b/workload/tasks.py index 48691ae7..aae07301 100644 --- a/workload/tasks.py +++ b/workload/tasks.py @@ -53,7 +53,7 @@ class Task(object): Args: name (`str`): The name of the computation (typically the callback of the ERDOS operator. - task_graph_name (`str`): The name of the TaskGraph that this Task belongs to. + task_graph (`str`): The name of the TaskGraph that this Task belongs to. job (`Job`): The job that created this particular task. deadline (`EventTime`): The absolute deadline by which the task should complete. 
profile (`WorkProfile`): A profile of the computation that the Task is supposed From 2dac435d3acdd1ffd1790041f01593b131c0f860 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 16:12:22 -0500 Subject: [PATCH 039/128] add support for returning current task graph placements from simulator --- rpc/service.py | 20 +++++++++++++++----- simulator.py | 42 +++++++++++++++++++++++++++++++++++++++++- tests/test_service.py | 6 +++--- 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index fcabc08b..a537f6cb 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -246,7 +246,7 @@ async def RegisterTaskGraph(self, request, context): self._registered_task_graphs[request.id] = RegisteredTaskGraph( task_graph, stage_id_mapping ) - msg = f"[{stime}] Registered task graph (id={request.id}, name={request.name}) successfully" + msg = f"[{stime}] Registered task graph '{task_graph.name}' successfully" # Add the task graph to the active task graphs if registration is successful self._active_task_graphs.add(request.id) @@ -267,9 +267,11 @@ async def RegisterEnvironmentReady(self, request, context): message=msg, ) - self._workload_loader.add_task_graph( - self._registered_task_graphs[request.id].graph - ) + # TODO: check if the task graph exists + + task_graph = self._registered_task_graphs[request.id].graph + + self._workload_loader.add_task_graph(task_graph) self._simulator._event_queue.add_event( Event( event_type=EventType.UPDATE_WORKLOAD, @@ -277,7 +279,7 @@ async def RegisterEnvironmentReady(self, request, context): ) ) - msg = f"[{stime}] Successfully marked environment as ready for task graph (id={request.id})" + msg = f"[{stime}] Successfully marked environment as ready for task graph '{task_graph.name}'" self._logger.info(msg) return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( success=True, @@ -345,6 +347,14 @@ async def GetPlacements(self, request, context): message=msg, ) + task_graph = 
self._registered_task_graphs[request.id].graph + placements = self._simulator.get_current_placements_for_task_graph( + task_graph.name + ) + self._logger.info( + f"Received the following placements for '{task_graph.name}': {placements}" + ) + return erdos_scheduler_pb2.GetPlacementsResponse( success=True, message=f"Placements for taskgraph {request.id} returned successfully.", diff --git a/simulator.py b/simulator.py index dd93230e..131eb0c7 100644 --- a/simulator.py +++ b/simulator.py @@ -5,7 +5,7 @@ from enum import Enum from functools import total_ordering from operator import attrgetter, itemgetter -from typing import Mapping, Optional, Sequence, Callable +from typing import Mapping, Optional, Sequence, Callable, Dict, List import absl # noqa: F401 @@ -344,6 +344,10 @@ def event_representation_filter(record): self._next_scheduler_event = None self._last_scheduler_placements: Optional[Placements] = None + # Stores current placements for tasks of a task graph + # task_graph => {task_id => placement} + self._current_task_graph_placements: Dict[str, Dict[str, Placement]] = {} + # A Cache from the TaskID to a future Placement event in the EventQueue. # The Simulator uses this bookkeeping to revoke / invalidate decisions made # by the past scheduler invocations. 
@@ -531,6 +535,13 @@ def __simulate_f(self, should_continue: Callable[[EventTime], bool]) -> None: if self.__handle_event(self._event_queue.next()): break + def get_current_placements_for_task_graph( + self, task_graph_name: str + ) -> List[Placement]: + if task_graph_name not in self._current_task_graph_placements: + raise ValueError(f"Task graph '{task_graph_name}' does not exist") + return list(self._current_task_graph_placements[task_graph_name].values()) + def __time_until_next_event(self) -> EventTime: return self._event_queue.peek().time - self._simulator_time @@ -1173,6 +1184,10 @@ def __handle_task_finished(self, event: Event) -> None: event.task.worker_pool_id ) task_placed_at_worker_pool.remove_task(current_time=event.time, task=event.task) + + # Remove the task from it's task graph's current placements + del self._current_task_graph_placements[event.task.task_graph][event.task.id] + event.task.finish() # Log the TASK_FINISHED event into the CSV. @@ -1193,6 +1208,10 @@ def __handle_task_finished(self, event: Event) -> None: if task_graph.deadline > event.time else event.time - task_graph.deadline ) + + # Remove task graph from current task graph placements map + del self._current_task_graph_placements[event.task.task_graph] + self._csv_logger.debug( f"{event.time.time},TASK_GRAPH_FINISHED,{task_graph.name}," f"{task_graph.deadline.to(EventTime.Unit.US).time}," @@ -1410,6 +1429,9 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: "[%s] Placed %s on %s.", event.time.time, task, worker_pool ) del self._future_placement_events[task.id] + self._current_task_graph_placements[task.task_graph][ + task.id + ] = event.placement else: next_placement_time = event.time + EventTime(1, EventTime.Unit.US) next_placement_event = Event( @@ -1565,6 +1587,24 @@ def __handle_update_workload(self, event: Event) -> None: len(releasable_tasks), ) + # Add task graph entry in self._current_task_graph_placements to + # track its task placements + for 
task_graph_name, task_graph in self._workload.task_graphs.items(): + # In addition to newly added task graphs, self._workload also + # contains all previously released task graphs. + # + # So, we guard the addition of the entry on two conditions: + # (1) The task graph doesn't have an entry (we don't want to + # nuke an existing one) + # (2) The task graph is not complete (we only keep the entry + # alive while the task graph is running to avoid a memory + # leak) + if ( + task_graph_name not in self._current_task_graph_placements + and not task_graph.is_complete() + ): + self._current_task_graph_placements[task_graph_name] = {} + # # Add the TaskGraphRelease events into the system. # for task_graph_name, task_graph in self._workload.task_graphs.items(): # event = Event( diff --git a/tests/test_service.py b/tests/test_service.py index 650d33ea..9fcb09ca 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -78,7 +78,7 @@ def test_service(): assert ( response.success and re.search( - r"Registered task graph \(id=task-graph-0, name=TPCH Query 4 50 50\) successfully", + r"Registered task graph 'Q4\[task-graph-0\]@1' successfully", response.message, ) and response.num_executors == 10 @@ -92,11 +92,11 @@ def test_service(): ) response = stub.RegisterEnvironmentReady(request) assert response.success and re.search( - r"Successfully marked environment as ready for task graph \(id=task-graph-0\)", + r"Successfully marked environment as ready for task graph 'Q4\[task-graph-0\]@1'", response.message, ) - time.sleep(16) + time.sleep(3) # Get placements for the task request = erdos_scheduler_pb2.GetPlacementsRequest( From 023805ce373000f8af84ecc79db7849e6b5ec054 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 16:58:34 -0500 Subject: [PATCH 040/128] fix logging --- rpc/service.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index a537f6cb..599adb25 100644 
--- a/rpc/service.py +++ b/rpc/service.py @@ -101,8 +101,6 @@ def __init__(self) -> None: self._scheduler = EDFScheduler(runtime=EventTime(0, EventTime.Unit.US)) self._registered_task_graphs = {} - # TODO: (Dhruv) Can we get the currently active task graphs directly from the workload object? - self._active_task_graphs = set() super().__init__() @@ -248,9 +246,6 @@ async def RegisterTaskGraph(self, request, context): ) msg = f"[{stime}] Registered task graph '{task_graph.name}' successfully" - # Add the task graph to the active task graphs if registration is successful - self._active_task_graphs.add(request.id) - return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=True, message=msg, @@ -262,15 +257,21 @@ async def RegisterEnvironmentReady(self, request, context): if not self.__framework_registered(): msg = f"[{stime}] Trying to notify that the environment is ready for task graph (id={request.id}) but no framework is registered yet" + self._logger.error(msg) return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( success=False, message=msg, ) - # TODO: check if the task graph exists + if request.id not in self._registered_task_graphs: + msg = f"[{stime}] Task graph of id '{request.id}' is not registered or does not exist" + self._logger.error(msg) + return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( + success=False, + message=msg, + ) task_graph = self._registered_task_graphs[request.id].graph - self._workload_loader.add_task_graph(task_graph) self._simulator._event_queue.add_event( Event( @@ -291,6 +292,7 @@ async def RegisterWorker(self, request, context): if not self.__framework_registered(): msg = f"[{stime}] Trying to register a worker (id={request.id}, name={request.name}) but no framework is registered yet" + self._logger.error(msg) return erdos_scheduler_pb2.RegisterWorkerResponse( success=False, message=msg ) @@ -316,7 +318,6 @@ async def RegisterWorker(self, request, context): msg = f"[{stime}] Registered worker (id={request.id}, 
name={request.name})" self._logger.info(msg) - return erdos_scheduler_pb2.RegisterWorkerResponse( success=True, message=msg, @@ -331,17 +332,8 @@ async def GetPlacements(self, request, context): # Check if the task graph is registered if request.id not in self._registered_task_graphs: - msg = ( - f"[{stime}] Task graph with id {request.task_graph_id} not registered." - ) - return erdos_scheduler_pb2.GetPlacementsResponse( - success=False, - message=msg, - ) - - # Check if the task graph is active - if request.id not in self._active_task_graphs: - msg = f"[{stime}] Task graph with id {request.task_graph_id} not active." + msg = f"[{stime}] Task graph with id '{request.id}' not registered." + self._logger.error(msg) return erdos_scheduler_pb2.GetPlacementsResponse( success=False, message=msg, @@ -357,7 +349,7 @@ async def GetPlacements(self, request, context): return erdos_scheduler_pb2.GetPlacementsResponse( success=True, - message=f"Placements for taskgraph {request.id} returned successfully.", + message=f"Placements for task graph '{request.id}' returned successfully", ) async def NotifyTaskCompletion(self, request, context): From 0dfd34e556ed3681722c007044e4f11ff1ae98a5 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 19 Nov 2024 17:24:33 -0500 Subject: [PATCH 041/128] construct and return placements response --- rpc/service.py | 28 +++++++++++++++++++++++++--- simulator.py | 3 ++- tests/test_service.py | 9 ++++++++- 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 599adb25..5a152767 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -339,17 +339,34 @@ async def GetPlacements(self, request, context): message=msg, ) - task_graph = self._registered_task_graphs[request.id].graph - placements = self._simulator.get_current_placements_for_task_graph( + task_graph, stage_id_mapping = self._registered_task_graphs[request.id] + sim_placements = self._simulator.get_current_placements_for_task_graph( 
task_graph.name ) + self._logger.info( - f"Received the following placements for '{task_graph.name}': {placements}" + f"Received the following placements for '{task_graph.name}': {sim_placements}" ) + # Construct response. Notably, we apply stage-id mapping + placements = [] + for placement in sim_placements: + worker_id = self.__get_worker_id() + task_id = stage_id_mapping[placement.task.name] + cores = sum(x for _, x in placement.execution_strategy.resources.resources) + placements.append( + { + "worker_id": worker_id, + "application_id": request.id, + "task_id": int(task_id), + "cores": cores, + } + ) + return erdos_scheduler_pb2.GetPlacementsResponse( success=True, message=f"Placements for task graph '{request.id}' returned successfully", + placements=placements, ) async def NotifyTaskCompletion(self, request, context): @@ -383,6 +400,11 @@ def __get_worker_pool(self): # Simulator maintains only one worker pool, so this should be fine return next(iter(self._simulator._worker_pools.worker_pools)) + def __get_worker_id(self): + # We return the name here because we register the worker id from + # Spark as the name of the worker in the worker pool + return self.__get_worker_pool().workers[0].name + async def serve(server): await server.start() diff --git a/simulator.py b/simulator.py index 131eb0c7..56d9b6f1 100644 --- a/simulator.py +++ b/simulator.py @@ -493,7 +493,8 @@ def __simulate_f(self, should_continue: Callable[[EventTime], bool]) -> None: """ # Run the simulator loop. 
while True: - if not should_continue(self._event_queue.peek().time): + top = self._event_queue.peek() + if top and not should_continue(top.time): break time_until_next_event = self.__time_until_next_event() diff --git a/tests/test_service.py b/tests/test_service.py index 9fcb09ca..826dd7dc 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -104,7 +104,14 @@ def test_service(): id="task-graph-0", ) response = stub.GetPlacements(request) - assert False + assert response.success + actual_task_ids = set() + for placement in response.placements: + assert ( + placement.worker_id == "1234" and placement.application_id == "task-graph-0" + ) + actual_task_ids.add(placement.task_id) + assert actual_task_ids == {0, 1} # Deregister framework request = erdos_scheduler_pb2.DeregisterFrameworkRequest( From 05b8f4717a4d16effc9a23de516543ce04cbb745 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 20 Nov 2024 11:02:11 -0500 Subject: [PATCH 042/128] do not return placements if task graph is complete --- rpc/service.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/rpc/service.py b/rpc/service.py index 5a152767..4235f794 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -340,6 +340,16 @@ async def GetPlacements(self, request, context): ) task_graph, stage_id_mapping = self._registered_task_graphs[request.id] + + # Check if the task graph is active + if task_graph.is_complete(): + msg = f"[{stime}] Task graph '{task_graph.name}' is complete. No more placements to provide." 
+ self._logger.error(msg) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=False, + message=msg, + ) + sim_placements = self._simulator.get_current_placements_for_task_graph( task_graph.name ) From 14ec521a120ecf9c024dca04d93f7ad4e28dc535 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 20 Nov 2024 17:37:04 -0500 Subject: [PATCH 043/128] fix placement time bug in simulator --- rpc/service.py | 10 +++++++--- simulator.py | 5 ++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 4235f794..af003d34 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -1,3 +1,4 @@ +import sys import time import asyncio from concurrent import futures @@ -96,9 +97,11 @@ def __init__(self) -> None: self._simulator = None self._workload_loader = None - # There is a bug in the simulator that causes tasks to be placed - # in the past if the scheduler runtime is not zero - self._scheduler = EDFScheduler(runtime=EventTime(0, EventTime.Unit.US)) + self._scheduler = EDFScheduler( + runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US), + enforce_deadlines=FLAGS.enforce_deadlines, + _flags=FLAGS, + ) self._registered_task_graphs = {} @@ -272,6 +275,7 @@ async def RegisterEnvironmentReady(self, request, context): ) task_graph = self._registered_task_graphs[request.id].graph + self._workload_loader.add_task_graph(task_graph) self._simulator._event_queue.add_event( Event( diff --git a/simulator.py b/simulator.py index 56d9b6f1..00ab649b 100644 --- a/simulator.py +++ b/simulator.py @@ -494,7 +494,7 @@ def __simulate_f(self, should_continue: Callable[[EventTime], bool]) -> None: # Run the simulator loop. 
while True: top = self._event_queue.peek() - if top and not should_continue(top.time): + if not top or not should_continue(top.time): break time_until_next_event = self.__time_until_next_event() @@ -1406,6 +1406,7 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: assert ( worker_pool is not None ), f"No WorkerPool found with ID: {event.placement.worker_pool_id}." + success = worker_pool.place_task( task, execution_strategy=event.placement.execution_strategy, @@ -2076,6 +2077,8 @@ def __run_scheduler(self, event: Event) -> Event: # Calculate the time at which the placements need to be applied. placement_time = event.time + placements.runtime + for placement in placements: + placement._placement_time = placement_time # Save the placements until the placement time arrives. self._last_scheduler_placements = placements From 062aa5c6fbea6123567488ec5aecce8d593f0e8f Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 20 Nov 2024 18:04:38 -0500 Subject: [PATCH 044/128] implement orchestrated mode --- main.py | 5 +++++ rpc/service.py | 7 +++++-- simulator.py | 45 +++++++++++++++++++++++++-------------------- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/main.py b/main.py index 2c42e338..b501c13f 100644 --- a/main.py +++ b/main.py @@ -116,6 +116,11 @@ "If set to default (-1), then the Simulator will automatically choose an interval " "based on the set of released tasks in the previous iteration.", ) +flags.DEFINE_bool( + "orchestrated", + False, + "Runs the simulator in orchestrated mode. Currently used by the ERDOS service.", +) # Benchmark related flags. 
flags.DEFINE_integer( diff --git a/rpc/service.py b/rpc/service.py index af003d34..dd8960b1 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -130,6 +130,9 @@ async def RegisterFramework(self, request, context): _logger=self._logger, ) self._workload_loader = WorkloadLoader() + + # Enable orchestrated mode + FLAGS.orchestrated = True self._simulator = Simulator( scheduler=self._scheduler, worker_pools=WorkerPools( @@ -225,7 +228,7 @@ async def RegisterTaskGraph(self, request, context): task_graph, stage_id_mapping = self._data_loaders[ DataLoader.TPCH ].make_task_graph( - id=id, + id=request.id, query_num=query_num, release_time=stime, dependencies=dependencies, @@ -248,7 +251,7 @@ async def RegisterTaskGraph(self, request, context): task_graph, stage_id_mapping ) msg = f"[{stime}] Registered task graph '{task_graph.name}' successfully" - + self._logger.info(msg) return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=True, message=msg, diff --git a/simulator.py b/simulator.py index 00ab649b..da98965e 100644 --- a/simulator.py +++ b/simulator.py @@ -379,6 +379,9 @@ def event_representation_filter(record): self._finished_task_graphs = 0 self._missed_task_graph_deadlines = 0 + # Is the simulator orchestrated? + self._orchestrated = _flags.orchestrated + # Initialize the event queue. # To make the system continue working the loop, we add three events: # - SIMULATOR_START: A notional event start the simulator and log into the CSV. @@ -503,7 +506,7 @@ def __simulate_f(self, should_continue: Callable[[EventTime], bool]) -> None: # Simulator until the closest remaining time. running_tasks = self._worker_pools.get_placed_tasks() - if len(running_tasks) > 0: + if not self._orchestrated and len(running_tasks) > 0: # There are running tasks, figure out the minimum remaining # time across all the tasks. min_task_remaining_time = min( @@ -1062,18 +1065,19 @@ def count_placed_tasks(placements: Placements): # Reset the available tasks and the last task placement. 
self._last_scheduler_placements = None - # The scheduler has finished its execution, insert an event for the next - # invocation of the scheduler. - next_sched_event = self.__get_next_scheduler_event( - event, - self._scheduler_frequency, - self._last_scheduler_start_time, - self._loop_timeout, - ) - self._event_queue.add_event(next_sched_event) - self._logger.info( - "[%s] Added %s to the event queue.", event.time.time, next_sched_event - ) + if not self._orchestrated: + # The scheduler has finished its execution, insert an event for the next + # invocation of the scheduler. + next_sched_event = self.__get_next_scheduler_event( + event, + self._scheduler_frequency, + self._last_scheduler_start_time, + self._loop_timeout, + ) + self._event_queue.add_event(next_sched_event) + self._logger.info( + "[%s] Added %s to the event queue.", event.time.time, next_sched_event + ) # Now that all the tasks are placed, ask the simulator to log the resource # utilization and quit later, if requested. @@ -1776,13 +1780,14 @@ def __step(self, step_size: EventTime = EventTime(1, EventTime.Unit.US)) -> None self._simulator_time.time, [event.task.unique_name for event in task_finished_events], ) - for task_finished_event in task_finished_events: - self._event_queue.add_event(task_finished_event) - self._logger.info( - "[%s] Added %s to the event queue.", - self._simulator_time.time, - task_finished_event, - ) + if not self._orchestrated: + for task_finished_event in task_finished_events: + self._event_queue.add_event(task_finished_event) + self._logger.info( + "[%s] Added %s to the event queue.", + self._simulator_time.time, + task_finished_event, + ) def __get_next_scheduler_event( self, From dfabf711a4ac1774d29c040a0b39c5b7ed37a308 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 20 Nov 2024 18:17:20 -0500 Subject: [PATCH 045/128] remove redundant checks --- rpc/service.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/rpc/service.py 
b/rpc/service.py index dd8960b1..f1a06b49 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -261,14 +261,6 @@ async def RegisterTaskGraph(self, request, context): async def RegisterEnvironmentReady(self, request, context): stime = self.__stime() - if not self.__framework_registered(): - msg = f"[{stime}] Trying to notify that the environment is ready for task graph (id={request.id}) but no framework is registered yet" - self._logger.error(msg) - return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( - success=False, - message=msg, - ) - if request.id not in self._registered_task_graphs: msg = f"[{stime}] Task graph of id '{request.id}' is not registered or does not exist" self._logger.error(msg) @@ -335,11 +327,9 @@ async def RegisterWorker(self, request, context): async def GetPlacements(self, request, context): stime = self.__stime() - # TODO (Dhruv): Can add check to verify that framework and worker are registered - # Check if the task graph is registered if request.id not in self._registered_task_graphs: - msg = f"[{stime}] Task graph with id '{request.id}' not registered." 
+ msg = f"[{stime}] Task graph with id '{request.id}' is not registered or does not exist" self._logger.error(msg) return erdos_scheduler_pb2.GetPlacementsResponse( success=False, From 76117c6bbefd7c5c4782d012699c3ccfe0fa7701 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 20 Nov 2024 22:11:28 -0500 Subject: [PATCH 046/128] enqueue scheduler start event in register task graph --- rpc/service.py | 85 +++++++++++++++++++++++++++++++++++++++++++++----- simulator.py | 3 ++ 2 files changed, 80 insertions(+), 8 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index f1a06b49..ad2bc942 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -130,7 +130,7 @@ async def RegisterFramework(self, request, context): _logger=self._logger, ) self._workload_loader = WorkloadLoader() - + # Enable orchestrated mode FLAGS.orchestrated = True self._simulator = Simulator( @@ -177,7 +177,23 @@ async def RegisterDriver(self, request, context): pass async def DeregisterDriver(self, request, context): - pass + if request.id not in self._registered_task_graphs: + msg = f"[{stime}] Task graph of id '{request.id}' is not registered or does not exist" + self._logger.error(msg) + return erdos_scheduler_pb2.DeregisterDriverResponse( + success=False, + message=msg, + ) + + task_graph, _ = self._registered_task_graphs[request.id] + del self._registered_task_graphs[request.id] + + msg = f"[{stime}] Successfully de-registered driver for task graph {task_graph.name}" + self._logger.info(msg) + return erdos_scheduler_pb2.DeregisterDriverResponse( + success=True, + message=msg, + ) async def RegisterTaskGraph(self, request, context): stime = self.__stime() @@ -272,12 +288,18 @@ async def RegisterEnvironmentReady(self, request, context): task_graph = self._registered_task_graphs[request.id].graph self._workload_loader.add_task_graph(task_graph) - self._simulator._event_queue.add_event( - Event( - event_type=EventType.UPDATE_WORKLOAD, - time=stime, - ) + + update_workload_event = Event( 
+ event_type=EventType.UPDATE_WORKLOAD, + time=stime, ) + self._simulator._event_queue.add_event(update_workload_event) + + scheduler_start_event = Event( + event_type=EventType.SCHEDULER_START, + time=stime.to(EventTime.Unit.US), + ) + self._simulator._event_queue.add_event(scheduler_start_event) msg = f"[{stime}] Successfully marked environment as ready for task graph '{task_graph.name}'" self._logger.info(msg) @@ -377,7 +399,54 @@ async def GetPlacements(self, request, context): ) async def NotifyTaskCompletion(self, request, context): - pass + stime = self.__stime() + + # Check if the task graph is registered + if request.application_id not in self._registered_task_graphs: + msg = f"[{stime}] Task graph with id '{request.id}' is not registered or does not exist" + self._logger.error(msg) + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=False, + message=msg, + ) + + task_graph, stage_id_mapping = self._registered_task_graphs[ + request.application_id + ] + task = task_graph.get_task(stage_id_mapping[request.task_id]) + if task is None: + msg = f"[{stime}] Task '{task_id}' does not exist in the task graph '{task_graph.name}'" + self._logger.error(msg) + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=False, + message=msg, + ) + + actual_task_completion_time = stime + task.remaining_time.time + + task_finished_event = Event( + event_type=EventType.TASK_FINISHED, + time=EventTime(time=actual_task_completion_time, unit=EventTime.Unit.S), + task=task, + ) + self._simulator._event_queue.add_event(task_finished_event) + + scheduler_start_event = Event( + event_type=EventType.SCHEDULER_START, + time=EventTime(time=actual_task_completion_time, unit=EventTime.Unit.S).to( + EventTime.Unit.US + ), + ) + self._simulator._event_queue.add_event(scheduler_start_event) + + # TODO(elton): log info message + # TODO(elton): write a test program to give this a spin + + # TODO(elton): update log message + return 
erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=True, + message=f"NotifyTaskCompletion for taskgraph {request.application_id}", + ) async def _tick_simulator(self): while True: diff --git a/simulator.py b/simulator.py index da98965e..3077d86c 100644 --- a/simulator.py +++ b/simulator.py @@ -401,6 +401,9 @@ def event_representation_filter(record): sim_start_event, ) + if self._orchestrated: + return + # Second, create the UPDATE_WORKLOAD event to retrieve the latest Workload. if self._workload_loader: upate_workload_event = Event( From fcf21428c68bd7de8e4f2e975dab6311bc72369f Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 20 Nov 2024 22:41:35 -0500 Subject: [PATCH 047/128] bug fixes --- rpc/service.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index ad2bc942..1819d0a8 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -12,7 +12,7 @@ from schedulers import EDFScheduler from simulator import Simulator, Event, EventTime, EventType from workers import Worker, WorkerPool, WorkerPools -from workload import Resource, Resources, Workload, TaskGraph +from workload import Resource, Resources, Workload, TaskGraph, TaskState from data import BaseWorkloadLoader from data.tpch_loader import TpchLoader from utils import setup_logging, setup_csv_logging @@ -415,37 +415,44 @@ async def NotifyTaskCompletion(self, request, context): ] task = task_graph.get_task(stage_id_mapping[request.task_id]) if task is None: - msg = f"[{stime}] Task '{task_id}' does not exist in the task graph '{task_graph.name}'" + msg = f"[{stime}] Task '{request.task_id}' does not exist in the task graph '{task_graph.name}'" self._logger.error(msg) return erdos_scheduler_pb2.NotifyTaskCompletionResponse( success=False, message=msg, ) - actual_task_completion_time = stime + task.remaining_time.time + if task.state != TaskState.RUNNING: + msg = f"[{stime}] Received task completion notification for 
task '{request.task_id}' but it is not running" + self._logger.error(msg) + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=False, + message=msg, + ) + + # HACK: The worker pool doesn't step every tick (probably should). So, the task.remaining_time is not accurate. We compute actual_task_completion then by getting the runtime from the profile, + actual_task_completion_time = ( + task.start_time + task.slowest_execution_strategy.runtime + ) task_finished_event = Event( event_type=EventType.TASK_FINISHED, - time=EventTime(time=actual_task_completion_time, unit=EventTime.Unit.S), + time=actual_task_completion_time, task=task, ) self._simulator._event_queue.add_event(task_finished_event) scheduler_start_event = Event( event_type=EventType.SCHEDULER_START, - time=EventTime(time=actual_task_completion_time, unit=EventTime.Unit.S).to( - EventTime.Unit.US - ), + time=actual_task_completion_time.to(EventTime.Unit.US), ) self._simulator._event_queue.add_event(scheduler_start_event) - # TODO(elton): log info message - # TODO(elton): write a test program to give this a spin - - # TODO(elton): update log message + msg = f"[{stime}] Successfully processed completion of task '{request.task_id}' of task graph '{task_graph.name}'" + self._logger.info(msg) return erdos_scheduler_pb2.NotifyTaskCompletionResponse( success=True, - message=f"NotifyTaskCompletion for taskgraph {request.application_id}", + message=msg, ) async def _tick_simulator(self): From a6d72e4755894b97ebc29defb59faf67c4e7b34b Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 20 Nov 2024 22:52:07 -0500 Subject: [PATCH 048/128] add a lock, haven't checked all places --- rpc/service.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 1819d0a8..f17c2120 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -1,3 +1,4 @@ +import threading import sys import time import asyncio @@ -104,6 +105,7 @@ def 
__init__(self) -> None: ) self._registered_task_graphs = {} + self._lock = threading.Lock() super().__init__() @@ -293,13 +295,14 @@ async def RegisterEnvironmentReady(self, request, context): event_type=EventType.UPDATE_WORKLOAD, time=stime, ) - self._simulator._event_queue.add_event(update_workload_event) - scheduler_start_event = Event( event_type=EventType.SCHEDULER_START, time=stime.to(EventTime.Unit.US), ) - self._simulator._event_queue.add_event(scheduler_start_event) + + with self._lock: + self._simulator._event_queue.add_event(update_workload_event) + self._simulator._event_queue.add_event(scheduler_start_event) msg = f"[{stime}] Successfully marked environment as ready for task graph '{task_graph.name}'" self._logger.info(msg) @@ -369,9 +372,10 @@ async def GetPlacements(self, request, context): message=msg, ) - sim_placements = self._simulator.get_current_placements_for_task_graph( - task_graph.name - ) + with self._lock: + sim_placements = self._simulator.get_current_placements_for_task_graph( + task_graph.name + ) self._logger.info( f"Received the following placements for '{task_graph.name}': {sim_placements}" @@ -440,13 +444,14 @@ async def NotifyTaskCompletion(self, request, context): time=actual_task_completion_time, task=task, ) - self._simulator._event_queue.add_event(task_finished_event) - scheduler_start_event = Event( event_type=EventType.SCHEDULER_START, time=actual_task_completion_time.to(EventTime.Unit.US), ) - self._simulator._event_queue.add_event(scheduler_start_event) + + with self._lock: + self._simulator._event_queue.add_event(task_finished_event) + self._simulator._event_queue.add_event(scheduler_start_event) msg = f"[{stime}] Successfully processed completion of task '{request.task_id}' of task graph '{task_graph.name}'" self._logger.info(msg) @@ -460,7 +465,8 @@ async def _tick_simulator(self): if self._simulator is not None: stime = self.__stime() self._logger.debug(f"[{stime}] Simulator tick") - self._simulator.tick(until=stime) + 
with self._lock: + self._simulator.tick(until=stime) else: print("Simulator instance is None") await asyncio.sleep(1) From 2ec812e4c28896b7e2d3bdb751f705f29694c4e9 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 21 Nov 2024 13:28:36 -0500 Subject: [PATCH 049/128] Add tests for notify task completion --- tests/test_service.py | 52 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/test_service.py b/tests/test_service.py index 826dd7dc..80ba5185 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -112,6 +112,58 @@ def test_service(): ) actual_task_ids.add(placement.task_id) assert actual_task_ids == {0, 1} + + # Wait for 3 seconds and trigger notify task completion for tasks 0 and 1 + time.sleep(3) + + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( + application_id="task-graph-0", + task_id=0, + timestamp=1234567890 + ) + response = stub.NotifyTaskCompletion(request) + assert response.success + + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( + application_id="task-graph-0", + task_id=1, + timestamp=1234567890 + ) + response = stub.NotifyTaskCompletion(request) + assert response.success + + # Wait for 20s to allow the service to execute task completion for fastest task + time.sleep(20) + + # Attempt to incorrectly notify task completion for task 3, which hasnt started yet + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( + application_id="task-graph-0", + task_id=3, + timestamp=1234567890 + ) + response = stub.NotifyTaskCompletion(request) + assert not response.success + + # Wait 2s to allow the service to process the incorrect task completion + time.sleep(2) + + # Wait for 25s to allow the service to finish execution of task 0 + time.sleep(25) + + # This will unlock task 2, which should now be returned as a placement + request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-0", + ) + response = stub.GetPlacements(request) + assert 
response.success + actual_task_ids = set() + for placement in response.placements: + assert ( + placement.worker_id == "1234" and placement.application_id == "task-graph-0" + ) + actual_task_ids.add(placement.task_id) + assert actual_task_ids == {2} # Deregister framework request = erdos_scheduler_pb2.DeregisterFrameworkRequest( From 42eee947c463f0984e7cbea8f3221da7fb592352 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 21 Nov 2024 13:14:26 -0500 Subject: [PATCH 050/128] add cancelled field to proto --- rpc/protos/rpc/erdos_scheduler.proto | 1 + 1 file changed, 1 insertion(+) diff --git a/rpc/protos/rpc/erdos_scheduler.proto b/rpc/protos/rpc/erdos_scheduler.proto index 494f5b49..0767be83 100644 --- a/rpc/protos/rpc/erdos_scheduler.proto +++ b/rpc/protos/rpc/erdos_scheduler.proto @@ -193,6 +193,7 @@ message Placement { string application_id = 2; uint32 task_id = 3; uint32 cores = 4; + bool cancelled = 5; // If the task (and thereby the task graph) should be cancelled } message GetPlacementsResponse { From 972a02076b1334d8bc047ab2030e514d45808b37 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 21 Nov 2024 13:18:09 -0500 Subject: [PATCH 051/128] populate cancelled field in rpc response --- rpc/service.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/rpc/service.py b/rpc/service.py index f17c2120..bf3fe27e 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -13,7 +13,7 @@ from schedulers import EDFScheduler from simulator import Simulator, Event, EventTime, EventType from workers import Worker, WorkerPool, WorkerPools -from workload import Resource, Resources, Workload, TaskGraph, TaskState +from workload import Resource, Resources, Workload, TaskGraph, TaskState, Placement from data import BaseWorkloadLoader from data.tpch_loader import TpchLoader from utils import setup_logging, setup_csv_logging @@ -387,12 +387,21 @@ async def GetPlacements(self, request, context): worker_id = self.__get_worker_id() 
task_id = stage_id_mapping[placement.task.name] cores = sum(x for _, x in placement.execution_strategy.resources.resources) + + if placement.placement_type not in ( + Placement.PlacementType.PLACE_TASK, + Placement.PlacementType.CANCEL_TASK, + ): + raise NotImplementedError + placements.append( { "worker_id": worker_id, "application_id": request.id, "task_id": int(task_id), "cores": cores, + "cancelled": placement.placement_type + == Placement.PlacementType.CANCEL_TASK, } ) From 1a7e7dd714a87aeb2934965f037994adeef90730 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 21 Nov 2024 13:52:01 -0500 Subject: [PATCH 052/128] Updates to spark erdos service documentation --- rpc/spark_erdos_setup.md | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/rpc/spark_erdos_setup.md b/rpc/spark_erdos_setup.md index 9f810264..745ba669 100644 --- a/rpc/spark_erdos_setup.md +++ b/rpc/spark_erdos_setup.md @@ -12,15 +12,15 @@ This README provides step-by-step instructions to set up the environment, compil ## Step 0: Create Conda Environment ```bash -conda create -n dg_erdos python=3.10 +conda create -n python=3.10 ``` ### Activate the environment: ```bash -conda activate dg_erdos +conda activate ``` -### If jdk17.0.9 isn't installed, install it for dg_erdos +### If jdk17.0.9 isn't installed, install it for ```bash conda install -c conda-forge openjdk=17.0.9 ``` @@ -35,7 +35,7 @@ git clone https://github.com/dhruvsgarg/spark_mirror.git --recursive Verify or set current branch `erdos-spark-integration` ### Start sbt shell -NOTE: `JAVA_HOME` should automatically get set to `/serenity/scratch/dgarg/anaconda3/envs/dg_erdos/lib/jvm` +NOTE: `JAVA_HOME` should automatically get set to `/path/to/anaconda3/envs//lib/jvm` ```bash ./build/sbt @@ -52,6 +52,9 @@ package ``` ## Step 2: Compile ERDOS +NOTE: The `erdos-scheduling-simulator` in Step 2 refers to the seperately cloned repository. 
It is not the `erdos-scheduling-simulator` submodule within +the spark-mirror repository. + ### Clone repo ```bash git clone https://github.com/erdos-project/erdos-scheduling-simulator.git --recursive @@ -74,7 +77,7 @@ export CMAKE_INSTALL_MODE=ABS_SYMLINK cmake .. -DINSTALL_GTEST=OFF -DTBB_INSTALL=OFF ``` -* Verify that python bindings are written to the new `dg_erdos` conda env and not some old env +* Verify that python bindings are written to the new `` conda env and not some old env ### Run make ```bash @@ -82,12 +85,15 @@ make -j install ``` ### Test that simulator works with `simple_av_workload` +NOTE: Might need to create `experiments` sub-directory if it doesnt already exist ```bash python3 main.py --flagfile=configs/simple_av_workload.conf > experiments/simple_av_workload_test.output ``` +The TaskGraph should complete and meet its deadline. -## Step 3: Using the Spark-Erdos service +## Step 3: Spark-Erdos service functionality test +NOTE: As in step 2, the `erdos-scheduling-simulator` here also refers to the seperately cloned repository. From the base directory: @@ -106,7 +112,9 @@ python -m grpc_tools.protoc -I./rpc/protos --python_out=. --grpc_python_out=. ./ python -m rpc.service ``` -### Run the test_service script using +### Run local tests for the erdos-spark service +Note: Verify that `pytest` is installed in the ``. Else first do `pip install pytest`. 
Once installed, run the tests using: ```bash -python -m rpc.dg_test_service -``` \ No newline at end of file +pytest tests/test_service.py +``` + From b244b7e06907b754e9ddb7f9c61ae0b9c6bf31c9 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 21 Nov 2024 14:36:31 -0500 Subject: [PATCH 053/128] implement register driver --- rpc/service.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/rpc/service.py b/rpc/service.py index bf3fe27e..1562c2b3 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -176,7 +176,14 @@ async def DeregisterFramework(self, request, context): ) async def RegisterDriver(self, request, context): - pass + msg = ( + f"[{stime}] Successfully registered driver for task graph {task_graph.name}" + ) + self._logger.info(msg) + return erdos_scheduler_pb2.DeregisterDriverResponse( + success=True, + message=msg, + ) async def DeregisterDriver(self, request, context): if request.id not in self._registered_task_graphs: From 0afc85a46aa1935bec178651eb19cd1d28f87a60 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 21 Nov 2024 15:05:04 -0500 Subject: [PATCH 054/128] quick fixes in register and de-register driver --- rpc/service.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rpc/service.py b/rpc/service.py index 1562c2b3..03b72b15 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -176,8 +176,10 @@ async def DeregisterFramework(self, request, context): ) async def RegisterDriver(self, request, context): + stime = self.__stime() + msg = ( - f"[{stime}] Successfully registered driver for task graph {task_graph.name}" + f"[{stime}] Successfully registered driver for app id {request.id}" ) self._logger.info(msg) return erdos_scheduler_pb2.DeregisterDriverResponse( @@ -186,6 +188,8 @@ async def RegisterDriver(self, request, context): ) async def DeregisterDriver(self, request, context): + stime = self.__stime() + if request.id not in self._registered_task_graphs: msg = f"[{stime}] Task graph of id
'{request.id}' is not registered or does not exist" self._logger.error(msg) From fd389ea6b46304c30a6517654049d65dc6bb38e2 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 21 Nov 2024 16:51:03 -0500 Subject: [PATCH 055/128] Updates to spark erdos documentation --- rpc/spark_erdos_setup.md | 69 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/rpc/spark_erdos_setup.md b/rpc/spark_erdos_setup.md index 745ba669..e44cc1ae 100644 --- a/rpc/spark_erdos_setup.md +++ b/rpc/spark_erdos_setup.md @@ -26,7 +26,8 @@ conda install -c conda-forge openjdk=17.0.9 ``` -## Step 1: Clone spark mirror with submodules +## Step 1: Setup `spark-mirror` +Clone the repository with submodules ```bash git clone https://github.com/dhruvsgarg/spark_mirror.git --recursive ``` @@ -34,26 +35,46 @@ git clone https://github.com/dhruvsgarg/spark_mirror.git --recursive ### Verify branch Verify or set current branch `erdos-spark-integration` -### Start sbt shell +### Verify env variable `SPARK_HOME` +Verify or set `SPARK_HOME` to point to the correct location of `spark-mirror`. + +### Verify env variable `JAVA_HOME` NOTE: `JAVA_HOME` should automatically get set to `/path/to/anaconda3/envs//lib/jvm` +### For first time compilation (entire package) +```bash +./build/sbt package +``` + +### For subsequent, quicker iterations +Start the interactive shell ```bash ./build/sbt ``` -### Switch to project spark-core +Switch to project spark-core ```bash project core ``` -### Compile and then package + +Compile and then package ```bash compile package ``` +### Fix guava versions for ERDOS-Spark integration +Fresh compile+package of spark adds `guava-14.0.1.jar` under `/path/to/spark_mirror/assembly/target/scala-2.13/jars/`. +This jar interferes with gRPC which requires a `guava-31` jar. 
To fix: +- Remove existing `guava-14` jar: `rm assembly/target/scala-2.13/jars/guava-14.0.1.jar` +- Run `./sbin/patch-erdos.sh` +- Verify `guava-31.0.1-jre.jar` exists under `assembly/target/scala-2.13/jars/` + + + ## Step 2: Compile ERDOS NOTE: The `erdos-scheduling-simulator` in Step 2 refers to the seperately cloned repository. It is not the `erdos-scheduling-simulator` submodule within -the spark-mirror repository. +the `spark-mirror` repository. ### Clone repo ```bash @@ -118,3 +139,41 @@ Note: Verify that `pytest` is installed in the ``. Else first do `pip pytest tests/test_service.py ``` +## Step 4: Running ERDOS with Spark backend + +### Start the service +```bash +python -m rpc.service +``` + +### Start all components of the spark cluster +Run the following commands from the root directory of the `spark-mirror` repository. + +Also, verify that environment variable `SPARK_HOME` is set correctly to point to the path of `spark_mirror` + +* Start Spark Master +```bash +./sbin/start-master.sh --host --properties-file /path/to/spark_mirror/conf/.conf +``` + +* Start Spark Worker +```bash +./sbin/start-worker.sh spark://:7077 --properties-file /path/to/spark_mirror/conf/.conf +``` + +* Start Spark History Server +```bash +./sbin/start-history-server.sh --properties-file /path/to/spark_mirror/conf/.conf +``` + +At this point, the spark framework should be registered with the erdos-service. 
+ +### Viewing spark cluster status +TBD + +### Shutdown cluster + +* To stop all spark services after the experiment concludes +```bash +./sbin/stop-all.sh +``` \ No newline at end of file From 8e313a9f2c519b89266d56882ed00458de915821 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Thu, 21 Nov 2024 18:43:02 -0500 Subject: [PATCH 056/128] register driver bug fix --- rpc/service.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 03b72b15..08964f75 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -77,6 +77,7 @@ def __init__(self) -> None: log_dir=FLAGS.log_dir, log_file=FLAGS.log_file_name, log_level=FLAGS.log_level, + fmt='[%(asctime)s] {%(funcName)s:%(lineno)d} - %(message)s', ) self._csv_logger = setup_csv_logging( name=__name__, @@ -182,9 +183,10 @@ async def RegisterDriver(self, request, context): f"[{stime}] Successfully registered driver for app id {request.id}" ) self._logger.info(msg) - return erdos_scheduler_pb2.DeregisterDriverResponse( + return erdos_scheduler_pb2.RegisterDriverResponse( success=True, message=msg, + worker_id=self.__get_worker_id(), ) async def DeregisterDriver(self, request, context): @@ -482,13 +484,13 @@ async def NotifyTaskCompletion(self, request, context): async def _tick_simulator(self): while True: - if self._simulator is not None: - stime = self.__stime() - self._logger.debug(f"[{stime}] Simulator tick") - with self._lock: + with self._lock: + if self._simulator is not None: + stime = self.__stime() + # self._logger.debug(f"[{stime}] Simulator tick") self._simulator.tick(until=stime) - else: - print("Simulator instance is None") + # else: + # print("Simulator instance is None") await asyncio.sleep(1) def __stime(self) -> EventTime: From ab56afbd4cf1896049798650d48d6d7e33e2ec0d Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 21 Nov 2024 18:57:51 -0500 Subject: [PATCH 057/128] Add override_worker_cpu_count flag --- rpc/service.py | 13 
++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 08964f75..33b1991d 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -50,6 +50,13 @@ 10, "The initial number of executors that are requested by each Spark application.", ) +flags.DEFINE_bool( + "override_worker_cpu_count", + False, + "If True, worker CPU count will be set to 640 (Cloudlab 20-node cluster CPU count). " + "This allows us to scale up spark experiments without actually deploying a large " + "spark cluster.", +) class DataLoader(Enum): @@ -340,9 +347,9 @@ async def RegisterWorker(self, request, context): cpu_resource = Resource(name="Slot") worker_resources = Resources( resource_vector={ - # TODO(elton): handle override worker cpu count? - cpu_resource: request.cores, - }, + cpu_resource: request.cores if not FLAGS.override_worker_cpu_count + else 640 + }, _logger=self._logger, ) worker = Worker( From b6fbab219892c668a1c1cccadb5bea319f519486 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 21 Nov 2024 23:54:22 -0500 Subject: [PATCH 058/128] More documentation: tpch-spark and spark-mirror related --- rpc/spark_erdos_setup.md | 67 +++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 12 deletions(-) diff --git a/rpc/spark_erdos_setup.md b/rpc/spark_erdos_setup.md index e44cc1ae..c77d2b31 100644 --- a/rpc/spark_erdos_setup.md +++ b/rpc/spark_erdos_setup.md @@ -9,8 +9,7 @@ This README provides step-by-step instructions to set up the environment, compil --- -## Step 0: Create Conda Environment - +## Step 0A: Create Conda Environment ```bash conda create -n python=3.10 ``` @@ -25,6 +24,27 @@ conda activate conda install -c conda-forge openjdk=17.0.9 ``` +## Step 0B: Setup TPCH (dataset, jar) workload +Build the dataset +```bash +cd /path/to/tpch-spark/dbgen + +make + +./dbgen +``` + +Running `./dbgen` above creates a dataset of scale factor `s` of `1` (default) i.e. 1GB. 
+ +> NOTE: Had updated the scala version to 2.13.0 in tpch.sbt + +Next, we build the target for `tpch-spark`: +```bash +sbt package +``` + +> NOTE: In case of errors in building the target, check `openjdk` version. It should be `17` and not `21`. + ## Step 1: Setup `spark-mirror` Clone the repository with submodules @@ -39,7 +59,7 @@ Verify or set current branch `erdos-spark-integration` Verify or set `SPARK_HOME` to point to the correct location of `spark-mirror`. ### Verify env variable `JAVA_HOME` -NOTE: `JAVA_HOME` should automatically get set to `/path/to/anaconda3/envs//lib/jvm` +> NOTE: `JAVA_HOME` should automatically get set to `/path/to/anaconda3/envs//lib/jvm` ### For first time compilation (entire package) ```bash @@ -70,10 +90,13 @@ This jar interferes with gRPC which requires a `guava-31` jar. To fix: - Run `./sbin/patch-erdos.sh` - Verify `guava-31.0.1-jre.jar` exists under `assembly/target/scala-2.13/jars/` - +### Update `PATH` with spark bin files +```bash +export PATH=$PATH:/path/to/spark_mirror/bin +``` ## Step 2: Compile ERDOS -NOTE: The `erdos-scheduling-simulator` in Step 2 refers to the seperately cloned repository. It is not the `erdos-scheduling-simulator` submodule within +> NOTE: The `erdos-scheduling-simulator` in Step 2 refers to the seperately cloned repository. It is not the `erdos-scheduling-simulator` submodule within the `spark-mirror` repository. ### Clone repo @@ -106,7 +129,7 @@ make -j install ``` ### Test that simulator works with `simple_av_workload` -NOTE: Might need to create `experiments` sub-directory if it doesnt already exist +> NOTE: Might need to create `experiments` sub-directory if it doesnt already exist ```bash python3 main.py --flagfile=configs/simple_av_workload.conf > experiments/simple_av_workload_test.output ``` @@ -114,7 +137,7 @@ The TaskGraph should complete and meet its deadline. 
## Step 3: Spark-Erdos service functionality test -NOTE: As in step 2, the `erdos-scheduling-simulator` here also refers to the seperately cloned repository. +> NOTE: As in step 2, the `erdos-scheduling-simulator` here also refers to the seperately cloned repository. From the base directory: @@ -134,7 +157,7 @@ python -m rpc.service ``` ### Run local tests for the erdos-spark service -Note: Verify that `pytest` is installed in the ``. Else first do `pip install pytest`. Once installed, run the tests using: +> NOTE: Verify that `pytest` is installed in the ``. Else first do `pip install pytest`. Once installed, run the tests using: ```bash pytest tests/test_service.py ``` @@ -169,11 +192,31 @@ Also, verify that environment variable `SPARK_HOME` is set correctly to point to At this point, the spark framework should be registered with the erdos-service. ### Viewing spark cluster status -TBD +Start a ssh tunnel to the node hosting the spark cluster and access port `18080` using the command: +```bash +ssh -L 18080::18080 @ +``` -### Shutdown cluster +Once this command succeeds, you can view the History Server on your laptop's browser at URL: `localhost:18080` -* To stop all spark services after the experiment concludes +> NOTE: Same process needs to be repeated to view Master and Worker UIs. They run on ports `8080` and `8081` respectively. 
+ +### Submitting a test spark application +To be submitted from within the `tpch-spark` repo: +```bash +/path/to/spark_mirror/bin/spark-submit --deploy-mode cluster --master spark://:7077 --conf 'spark.port.maxRetries=132' --conf 'spark.eventLog.enabled=true' --conf 'spark.eventLog.dir=/path/to/event_log' --conf 'spark.sql.adaptive.enabled=false' --conf 'spark.sql.adaptive.coalescePartitions.enabled=false' --conf 'spark.sql.autoBroadcastJoinThreshold=-1' --conf 'spark.sql.shuffle.partitions=1' --conf 'spark.sql.files.minPartitionNum=1' --conf 'spark.sql.files.maxPartitionNum=1' --conf 'spark.app.deadline=120' --class 'main.scala.TpchQuery' target/scala-2.13/spark-tpc-h-queries_2.13-1.0.jar "4" "50" "50" +``` + +The above job submission is parameterized by `(DEADLINE, QUERY_NUM, DATASET_SIZE, MAX_CORES)`. An example input value for this tuple is +`(120, 4, 50, 50)`. +> Refer to `launch_expt_script.py` in `tpch-spark` for more details on eligible values for these parameters and how they are used. + +Once submitted, review the application's runtime status on the Spark Web UI. + +### Shutdown cluster +* To stop the master and worker(s) after the experiment concludes, run: ```bash ./sbin/stop-all.sh -``` \ No newline at end of file +``` + +> NOTE: This command does not terminate the History Server process. 
\ No newline at end of file From 977f36a1244605c38f13d8107681669387dbd6b5 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Fri, 22 Nov 2024 12:17:19 -0500 Subject: [PATCH 059/128] add delay in test_service for register env ready --- tests/test_service.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_service.py b/tests/test_service.py index 80ba5185..cf4a977c 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -83,6 +83,9 @@ def test_service(): ) and response.num_executors == 10 ) + + # Introduce a 2s delay in getting the env ready + time.sleep(2) # Mark the environment as ready request = erdos_scheduler_pb2.RegisterEnvironmentReadyRequest( From d793b25a8cd86002d8ae0e643de2da283238b963 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Fri, 22 Nov 2024 14:49:40 -0500 Subject: [PATCH 060/128] doc update for tpch spark --- rpc/spark_erdos_setup.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rpc/spark_erdos_setup.md b/rpc/spark_erdos_setup.md index c77d2b31..0a6775fc 100644 --- a/rpc/spark_erdos_setup.md +++ b/rpc/spark_erdos_setup.md @@ -36,7 +36,7 @@ make Running `./dbgen` above creates a dataset of scale factor `s` of `1` (default) i.e. 1GB. -> NOTE: Had updated the scala version to 2.13.0 in tpch.sbt +> NOTE: Had updated the scala version to 2.13.0 in tpch.sbt. The sbt version used was `1.9.7`. Next, we build the target for `tpch-spark`: ```bash @@ -211,6 +211,9 @@ The above job submission is parameterized by `(DEADLINE, QUERY_NUM, DATASET_SIZE `(120, 4, 50, 50)`. > Refer to `launch_expt_script.py` in `tpch-spark` for more details on eligible values for these parameters and how they are used. +> NOTE: By default, env variable `TPCH_INPUT_DATA_DIR` will look for `dbgen` inside the current working directory. While it works for `spark-submit` +> issued from inside the `tpch-spark` repository, it needs to be explicitly set otherwise. + Once submitted, review the application's runtime status on the Spark Web UI. 
### Shutdown cluster From 6264772644928eb45b5a0a7603b87df95d1bd29a Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Fri, 22 Nov 2024 14:57:59 -0500 Subject: [PATCH 061/128] Add test to invoke getPlacements before task registration --- tests/test_service.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_service.py b/tests/test_service.py index cf4a977c..27c49814 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -60,6 +60,18 @@ def test_service(): r"Failed to load TPCH query 4. Exception: Structure of dependencies provided for query number 4 does not match that of canonical dependencies", response.message, ) + + # Try to fetch placements for an unregistered task graph + # Get placements for the task, should be empty + request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-0", + ) + response = stub.GetPlacements(request) + assert not response.success and re.search( + r"Task graph with id \'task-graph-0\' is not registered or does not exist", + response.message, + ) # Register a correct TaskGraph request = erdos_scheduler_pb2.RegisterTaskGraphRequest( From bf658fe91efc162a8ab88b9f598e2278c7a0027e Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Fri, 22 Nov 2024 17:15:13 -0500 Subject: [PATCH 062/128] create task graph after environment is ready --- rpc/service.py | 128 ++++++++++++++++++++++++++++++------------------- 1 file changed, 79 insertions(+), 49 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 33b1991d..ce2aa220 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -3,10 +3,10 @@ import time import asyncio from concurrent import futures -from collections import namedtuple from urllib.parse import urlparse -from typing import Optional +from typing import Optional, Dict from enum import Enum +from dataclasses import dataclass # TODO: refactor out the need to import main to get common flags import main @@ -74,7 +74,25 @@ def get_next_workload(self, current_time: 
EventTime) -> Optional[Workload]: return self._workload -RegisteredTaskGraph = namedtuple("RegisteredTaskGraph", ["graph", "stage_id_mapping"]) +# TODO(elton): rename to RegisteredApplication +# TODO(elton): write documentation on how to use + + +@dataclass +class RegisteredTaskGraph: + gen: any # TODO(elton): proper type + task_graph: TaskGraph = None + stage_id_mapping: any = None # TODO(elton): proper type + last_gen: any = None # TODO(elton): proper type + + def __init__(self, gen): + self.gen = gen + + def generate_task_graph(self, release_time): + task_graph, stage_id_mapping = self.gen(release_time) + self.task_graph = task_graph + self.stage_id_mapping = stage_id_mapping + self.last_gen = release_time class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): @@ -84,7 +102,7 @@ def __init__(self) -> None: log_dir=FLAGS.log_dir, log_file=FLAGS.log_file_name, log_level=FLAGS.log_level, - fmt='[%(asctime)s] {%(funcName)s:%(lineno)d} - %(message)s', + fmt="[%(asctime)s] {%(funcName)s:%(lineno)d} - %(message)s", ) self._csv_logger = setup_csv_logging( name=__name__, @@ -185,10 +203,8 @@ async def DeregisterFramework(self, request, context): async def RegisterDriver(self, request, context): stime = self.__stime() - - msg = ( - f"[{stime}] Successfully registered driver for app id {request.id}" - ) + + msg = f"[{stime}] Successfully registered driver for app id {request.id}" self._logger.info(msg) return erdos_scheduler_pb2.RegisterDriverResponse( success=True, @@ -198,7 +214,7 @@ async def RegisterDriver(self, request, context): async def DeregisterDriver(self, request, context): stime = self.__stime() - + if request.id not in self._registered_task_graphs: msg = f"[{stime}] Task graph of id '{request.id}' is not registered or does not exist" self._logger.error(msg) @@ -261,34 +277,37 @@ async def RegisterTaskGraph(self, request, context): } ) - # Construct the task graph - try: - task_graph, stage_id_mapping = self._data_loaders[ - DataLoader.TPCH - 
].make_task_graph( - id=request.id, - query_num=query_num, - release_time=stime, - dependencies=dependencies, - dataset_size=dataset_size, - max_executors_per_job=max_executors_per_job, - runtime_unit=EventTime.Unit.S, - ) - except Exception as e: - msg = f"[{stime}] Failed to load TPCH query {query_num}. Exception: {e}" - return erdos_scheduler_pb2.RegisterTaskGraphResponse( - success=False, message=msg, num_executors=0 - ) + def gen(release_time): + # Construct the task graph + try: + task_graph, stage_id_mapping = self._data_loaders[ + DataLoader.TPCH + ].make_task_graph( + id=request.id, + query_num=query_num, + release_time=release_time, + dependencies=dependencies, + dataset_size=dataset_size, + max_executors_per_job=max_executors_per_job, + runtime_unit=EventTime.Unit.S, + ) + except Exception as e: + msg = f"[{stime}] Failed to load TPCH query {query_num}. Exception: {e}" + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message=msg, num_executors=0 + ) + + return task_graph, stage_id_mapping + else: msg = f"[{stime}] The service only supports TPCH queries" return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=False, message=msg, num_executors=0 ) - self._registered_task_graphs[request.id] = RegisteredTaskGraph( - task_graph, stage_id_mapping - ) - msg = f"[{stime}] Registered task graph '{task_graph.name}' successfully" + self._registered_task_graphs[request.id] = RegisteredTaskGraph(gen=gen) + + msg = f"[{stime}] Registered task graph '{request.id}' successfully" self._logger.info(msg) return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=True, @@ -307,9 +326,12 @@ async def RegisterEnvironmentReady(self, request, context): message=msg, ) - task_graph = self._registered_task_graphs[request.id].graph + r = self._registered_task_graphs[request.id] + + # Generate the task graph now + r.generate_task_graph(stime) - self._workload_loader.add_task_graph(task_graph) + self._workload_loader.add_task_graph(r.task_graph) 
update_workload_event = Event( event_type=EventType.UPDATE_WORKLOAD, @@ -324,7 +346,7 @@ async def RegisterEnvironmentReady(self, request, context): self._simulator._event_queue.add_event(update_workload_event) self._simulator._event_queue.add_event(scheduler_start_event) - msg = f"[{stime}] Successfully marked environment as ready for task graph '{task_graph.name}'" + msg = f"[{stime}] Successfully marked environment as ready for task graph '{r.task_graph.name}'" self._logger.info(msg) return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( success=True, @@ -347,9 +369,10 @@ async def RegisterWorker(self, request, context): cpu_resource = Resource(name="Slot") worker_resources = Resources( resource_vector={ - cpu_resource: request.cores if not FLAGS.override_worker_cpu_count - else 640 - }, + # TODO(elton): handle override worker cpu count? + # cpu_resource: request.cores, + cpu_resource: 640, + }, _logger=self._logger, ) worker = Worker( @@ -381,11 +404,20 @@ async def GetPlacements(self, request, context): message=msg, ) - task_graph, stage_id_mapping = self._registered_task_graphs[request.id] + r = self._registered_task_graphs[request.id] + + if r.task_graph is None: + msg = f"[{stime}] Task graph '{request.id}' is not ready" + self._logger.error(msg) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=True, + message=msg, + placements=[], + ) # Check if the task graph is active - if task_graph.is_complete(): - msg = f"[{stime}] Task graph '{task_graph.name}' is complete. No more placements to provide." + if r.task_graph.is_complete(): + msg = f"[{stime}] Task graph '{r.task_graph.name}' is complete. No more placements to provide." 
self._logger.error(msg) return erdos_scheduler_pb2.GetPlacementsResponse( success=False, @@ -394,18 +426,18 @@ async def GetPlacements(self, request, context): with self._lock: sim_placements = self._simulator.get_current_placements_for_task_graph( - task_graph.name + r.task_graph.name ) self._logger.info( - f"Received the following placements for '{task_graph.name}': {sim_placements}" + f"Received the following placements for '{r.task_graph.name}': {sim_placements}" ) # Construct response. Notably, we apply stage-id mapping placements = [] for placement in sim_placements: worker_id = self.__get_worker_id() - task_id = stage_id_mapping[placement.task.name] + task_id = r.stage_id_mapping[placement.task.name] cores = sum(x for _, x in placement.execution_strategy.resources.resources) if placement.placement_type not in ( @@ -443,12 +475,10 @@ async def NotifyTaskCompletion(self, request, context): message=msg, ) - task_graph, stage_id_mapping = self._registered_task_graphs[ - request.application_id - ] - task = task_graph.get_task(stage_id_mapping[request.task_id]) + r = self._registered_task_graphs[request.application_id] + task = r.task_graph.get_task(r.stage_id_mapping[request.task_id]) if task is None: - msg = f"[{stime}] Task '{request.task_id}' does not exist in the task graph '{task_graph.name}'" + msg = f"[{stime}] Task '{request.task_id}' does not exist in the task graph '{r.task_graph.name}'" self._logger.error(msg) return erdos_scheduler_pb2.NotifyTaskCompletionResponse( success=False, @@ -482,7 +512,7 @@ async def NotifyTaskCompletion(self, request, context): self._simulator._event_queue.add_event(task_finished_event) self._simulator._event_queue.add_event(scheduler_start_event) - msg = f"[{stime}] Successfully processed completion of task '{request.task_id}' of task graph '{task_graph.name}'" + msg = f"[{stime}] Successfully processed completion of task '{request.task_id}' of task graph '{r.task_graph.name}'" self._logger.info(msg) return 
erdos_scheduler_pb2.NotifyTaskCompletionResponse( success=True, From 7873e73f567aa1f17b6a456741557544e558f3aa Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Fri, 22 Nov 2024 17:17:56 -0500 Subject: [PATCH 063/128] update test --- tests/test_service.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/test_service.py b/tests/test_service.py index 27c49814..40eee8a8 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -46,21 +46,7 @@ def test_service(): r"Registered worker \(id=1234, name=test_worker\)", response.message ) - # Register an incorrect TaskGraph - request = erdos_scheduler_pb2.RegisterTaskGraphRequest( - id="task-graph", - name="TPCH Query 4 50 50", - timestamp=1234567890, - dependencies=[ - {"key": {"id": 0, "name": "stage 0"}, "children_ids": [1, 2]}, - ], - ) - response = stub.RegisterTaskGraph(request) - assert not response.success and re.search( - r"Failed to load TPCH query 4. Exception: Structure of dependencies provided for query number 4 does not match that of canonical dependencies", - response.message, - ) - + # Try to fetch placements for an unregistered task graph # Get placements for the task, should be empty request = erdos_scheduler_pb2.GetPlacementsRequest( @@ -73,6 +59,22 @@ def test_service(): response.message, ) + # TODO: move to environment ready + # Register an incorrect TaskGraph + # request = erdos_scheduler_pb2.RegisterTaskGraphRequest( + # id="task-graph", + # name="TPCH Query 4 50 50", + # timestamp=1234567890, + # dependencies=[ + # {"key": {"id": 0, "name": "stage 0"}, "children_ids": [1, 2]}, + # ], + # ) + # response = stub.RegisterTaskGraph(request) + # assert not response.success and re.search( + # r"Failed to load TPCH query 4. 
Exception: Structure of dependencies provided for query number 4 does not match that of canonical dependencies", + # response.message, + # ) + # Register a correct TaskGraph request = erdos_scheduler_pb2.RegisterTaskGraphRequest( id="task-graph-0", @@ -90,7 +92,7 @@ def test_service(): assert ( response.success and re.search( - r"Registered task graph 'Q4\[task-graph-0\]@1' successfully", + r"Registered task graph 'task-graph-0' successfully", response.message, ) and response.num_executors == 10 From e76c3fc2c4a3c557a5f7544924eb9c194f832b20 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Sat, 23 Nov 2024 12:48:22 -0500 Subject: [PATCH 064/128] Separate out internal state mgmt for driver and application --- rpc/service.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index ce2aa220..30801ccd 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -130,7 +130,9 @@ def __init__(self) -> None: _flags=FLAGS, ) - self._registered_task_graphs = {} + # TODO: Items in _registered_task_graphs are never deleted right now, needs to be handled. + self._registered_task_graphs = {} + self._registered_app_drivers = {} # Spark driver id differs from taskgraph name (application id) self._lock = threading.Lock() super().__init__() @@ -203,8 +205,21 @@ async def DeregisterFramework(self, request, context): async def RegisterDriver(self, request, context): stime = self.__stime() - - msg = f"[{stime}] Successfully registered driver for app id {request.id}" + + if request.id in self._registered_app_drivers: + msg = f"[{stime}] Driver with id '{request.id}' is already registered" + self._logger.error(msg) + return erdos_scheduler_pb2.RegisterDriverResponse( + success=False, + message=msg, + worker_id=self.__get_worker_id(), + ) + + # TODO: Update the registered_app_drivers to map the driver id to + # application id once the taskgraph is registered. 
+ self._registered_app_drivers[request.id] = None + + msg = f"[{stime}] Successfully registered driver {request.id} for an application." self._logger.info(msg) return erdos_scheduler_pb2.RegisterDriverResponse( success=True, @@ -215,18 +230,20 @@ async def RegisterDriver(self, request, context): async def DeregisterDriver(self, request, context): stime = self.__stime() - if request.id not in self._registered_task_graphs: - msg = f"[{stime}] Task graph of id '{request.id}' is not registered or does not exist" + if request.id not in self._registered_app_drivers: + msg = f"[{stime}] Driver id '{request.id}' is not registered or does not exist" self._logger.error(msg) return erdos_scheduler_pb2.DeregisterDriverResponse( success=False, message=msg, ) + + # TODO: Dummy mapping from driver to task graph (application), so task_graph_name is None. + # Deletion of taskgraph from registered_task_graphs and driver from registered_app_drivers should be done carefully. + task_graph_name = self._registered_app_drivers[request.id] + del self._registered_app_drivers[request.id] - task_graph, _ = self._registered_task_graphs[request.id] - del self._registered_task_graphs[request.id] - - msg = f"[{stime}] Successfully de-registered driver for task graph {task_graph.name}" + msg = f"[{stime}] Successfully de-registered driver with id {request.id} for task graph {task_graph_name}" self._logger.info(msg) return erdos_scheduler_pb2.DeregisterDriverResponse( success=True, From 5891dc3a05ba91144a5306e52020d9b5fff8b25f Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Sat, 23 Nov 2024 13:36:09 -0500 Subject: [PATCH 065/128] re-add impl for override_worker_cpu_count --- rpc/service.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 30801ccd..2addc894 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -386,9 +386,8 @@ async def RegisterWorker(self, request, context): cpu_resource = Resource(name="Slot") worker_resources = Resources( 
resource_vector={ - # TODO(elton): handle override worker cpu count? - # cpu_resource: request.cores, - cpu_resource: 640, + cpu_resource: request.cores if not FLAGS.override_worker_cpu_count + else 640 }, _logger=self._logger, ) From 25f3acdde838acc3bd1ef468bcb3ad6873e726f9 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Sat, 23 Nov 2024 18:20:28 -0500 Subject: [PATCH 066/128] fix: correct enqueue of task_finished and sched_start in notifyTaskCompletion --- rpc/service.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 2addc894..ba7cd03d 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -362,6 +362,8 @@ async def RegisterEnvironmentReady(self, request, context): with self._lock: self._simulator._event_queue.add_event(update_workload_event) self._simulator._event_queue.add_event(scheduler_start_event) + self._logger.info(f"[{stime}] Adding event {update_workload_event} to the simulator's event queue") + self._logger.info(f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue") msg = f"[{stime}] Successfully marked environment as ready for task graph '{r.task_graph.name}'" self._logger.info(msg) @@ -513,20 +515,25 @@ async def NotifyTaskCompletion(self, request, context): actual_task_completion_time = ( task.start_time + task.slowest_execution_strategy.runtime ) - + + # NOTE: Although the actual_task_completion_time works for task completion notifications that arrive early, it is + # inaccurate for task completion notifications that occur past that time. Thus, a max of the current and actual completion time + # is taken to ensure that the task is marked completed at the correct time. 
task_finished_event = Event( event_type=EventType.TASK_FINISHED, - time=actual_task_completion_time, + time=max(actual_task_completion_time, stime), task=task, ) scheduler_start_event = Event( event_type=EventType.SCHEDULER_START, - time=actual_task_completion_time.to(EventTime.Unit.US), + time=max(actual_task_completion_time.to(EventTime.Unit.US), stime.to(EventTime.Unit.US)), ) with self._lock: self._simulator._event_queue.add_event(task_finished_event) self._simulator._event_queue.add_event(scheduler_start_event) + self._logger.info(f"[{stime}] Adding event {task_finished_event} to the simulator's event queue") + self._logger.info(f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue") msg = f"[{stime}] Successfully processed completion of task '{request.task_id}' of task graph '{r.task_graph.name}'" self._logger.info(msg) From 7ad75ec73f607b1f16bc9d4ec5d3aeb4f686b948 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Sat, 23 Nov 2024 18:22:13 -0500 Subject: [PATCH 067/128] enable task cancellations to be sent back to backend --- rpc/service.py | 7 +++++-- simulator.py | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index ba7cd03d..b01072b8 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -454,9 +454,9 @@ async def GetPlacements(self, request, context): # Construct response. 
Notably, we apply stage-id mapping placements = [] for placement in sim_placements: - worker_id = self.__get_worker_id() + worker_id = self.__get_worker_id() if placement.placement_type == Placement.PlacementType.PLACE_TASK else "None" task_id = r.stage_id_mapping[placement.task.name] - cores = sum(x for _, x in placement.execution_strategy.resources.resources) + cores = sum(x for _, x in placement.execution_strategy.resources.resources) if placement.placement_type == Placement.PlacementType.PLACE_TASK else 0 if placement.placement_type not in ( Placement.PlacementType.PLACE_TASK, @@ -474,6 +474,9 @@ async def GetPlacements(self, request, context): == Placement.PlacementType.CANCEL_TASK, } ) + self._logger.info( + f"Sending placements for '{r.task_graph.name}': {placements}" + ) return erdos_scheduler_pb2.GetPlacementsResponse( success=True, diff --git a/simulator.py b/simulator.py index 3077d86c..3712c94f 100644 --- a/simulator.py +++ b/simulator.py @@ -710,6 +710,9 @@ def __create_events_from_task_placement_skip( task=cancelled_task, ) ) + self._current_task_graph_placements[placement.task.task_graph][ + placement.task.id + ] = placement if task_graph.is_cancelled(): released_tasks_from_new_task_graph = ( From 83d63d8986faa9a362254806774b6c01096ccd45 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Sat, 23 Nov 2024 18:26:29 -0500 Subject: [PATCH 068/128] update test script to include cancel_task scenario (needs to be sped up) --- tests/test_service.py | 88 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) diff --git a/tests/test_service.py b/tests/test_service.py index 40eee8a8..741209dc 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -75,7 +75,7 @@ def test_service(): # response.message, # ) - # Register a correct TaskGraph + # Register the first (correct) TaskGraph, it will be able to run request = erdos_scheduler_pb2.RegisterTaskGraphRequest( id="task-graph-0", name="TPCH Query 4 50 50", @@ -181,6 
+181,92 @@ def test_service(): ) actual_task_ids.add(placement.task_id) assert actual_task_ids == {2} + + # Register the second (correct) TaskGraph, wont be able to run due to inadequate resources + request = erdos_scheduler_pb2.RegisterTaskGraphRequest( + id="task-graph-1", + name="TPCH Query 4 50 200", + timestamp=1234567890, + dependencies=[ + {"key": {"id": 0, "name": "stage 0"}, "children_ids": [2]}, + {"key": {"id": 1, "name": "stage 1"}, "children_ids": [2]}, + {"key": {"id": 2, "name": "stage 2"}, "children_ids": [3]}, + {"key": {"id": 3, "name": "stage 3"}, "children_ids": [4]}, + {"key": {"id": 4, "name": "stage 4"}, "children_ids": []}, + ], + ) + response = stub.RegisterTaskGraph(request) + assert ( + response.success + and re.search( + r"Registered task graph 'task-graph-1' successfully", + response.message, + ) + and response.num_executors == 10 + ) + + # Introduce a 2s delay in getting the env ready + time.sleep(2) + + # Mark the environment as ready + request = erdos_scheduler_pb2.RegisterEnvironmentReadyRequest( + id="task-graph-1", + num_executors=10, + timestamp=1234567890, + ) + response = stub.RegisterEnvironmentReady(request) + assert response.success and re.search( + r"Successfully marked environment as ready for task graph 'Q4\[task-graph-1\]@1'", + response.message, + ) + + # Wait for 10s to get the placements for the second task graph + time.sleep(10) + + # Get placements for the task, none should be placed since worker has inadequate resources + request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-1", + ) + response = stub.GetPlacements(request) + assert response.success + actual_task_ids = set() + for placement in response.placements: + assert ( + placement.worker_id == "1234" and placement.application_id == "task-graph-1" + ) + actual_task_ids.add(placement.task_id) + assert len(actual_task_ids) == 0 + + # Wait for 100 more seconds and request placements again + time.sleep(100) + + # Notify task 
completion for task 2 in task graph 0 to trigger scheduler run again + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( + application_id="task-graph-0", + task_id=2, + timestamp=1234567890 + ) + response = stub.NotifyTaskCompletion(request) + assert response.success + + # Wait for 2 seconds to allow scheduler to process task completion and run scheduler + time.sleep(2) + + # Get placements for the task, entire taskgraph should be cancelled + request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-1", + ) + response = stub.GetPlacements(request) + assert response.success + actual_task_ids = set() + for placement in response.placements: + assert ( + placement.worker_id == "None" and placement.application_id == "task-graph-1" and placement.cancelled == True + ) + actual_task_ids.add(placement.task_id) + assert actual_task_ids == {0, 1} # Deregister framework request = erdos_scheduler_pb2.DeregisterFrameworkRequest( From 66ec393e3f0a72a816e46aabc8bad46314ed869d Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Sun, 24 Nov 2024 23:32:15 -0500 Subject: [PATCH 069/128] allow service to use different schedulers based on args --- rpc/service.py | 61 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 7 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index b01072b8..da9a5bfb 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -10,7 +10,6 @@ # TODO: refactor out the need to import main to get common flags import main -from schedulers import EDFScheduler from simulator import Simulator, Event, EventTime, EventType from workers import Worker, WorkerPool, WorkerPools from workload import Resource, Resources, Workload, TaskGraph, TaskState, Placement @@ -124,11 +123,59 @@ def __init__(self) -> None: self._simulator = None self._workload_loader = None - self._scheduler = EDFScheduler( - runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US), - enforce_deadlines=FLAGS.enforce_deadlines, - 
_flags=FLAGS, - ) + # Instantiate the scheduler based on the given flag. + self._scheduler = None + if FLAGS.scheduler == "FIFO": + from schedulers import FIFOScheduler + + self._scheduler = FIFOScheduler( + preemptive=FLAGS.preemption, + runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US), + enforce_deadlines=FLAGS.enforce_deadlines, # TODO: (DG) Check why this isnt passed in the simulator + _flags=FLAGS, + ) + elif FLAGS.scheduler == "EDF": + from schedulers import EDFScheduler + + self._scheduler = EDFScheduler( + preemptive=FLAGS.preemption, + runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US), + enforce_deadlines=FLAGS.enforce_deadlines, + _flags=FLAGS, + ) + elif FLAGS.scheduler == "TetriSched": + from schedulers import TetriSchedScheduler + + finer_discretization = FLAGS.finer_discretization_at_prev_solution + self._scheduler = TetriSchedScheduler( + preemptive=FLAGS.preemption, + runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US), + lookahead=EventTime(FLAGS.scheduler_lookahead, EventTime.Unit.US), + enforce_deadlines=FLAGS.enforce_deadlines, + retract_schedules=FLAGS.retract_schedules, + release_taskgraphs=FLAGS.release_taskgraphs, + goal=FLAGS.ilp_goal, + time_discretization=EventTime( + FLAGS.scheduler_time_discretization, EventTime.Unit.US + ), + plan_ahead=EventTime(FLAGS.scheduler_plan_ahead, EventTime.Unit.US), + log_to_file=FLAGS.scheduler_log_to_file, + adaptive_discretization=FLAGS.scheduler_adaptive_discretization, + _flags=FLAGS, + max_time_discretization=EventTime( + FLAGS.scheduler_max_time_discretization, EventTime.Unit.US + ), + max_occupancy_threshold=FLAGS.scheduler_max_occupancy_threshold, + finer_discretization_at_prev_solution=finer_discretization, + finer_discretization_window=EventTime( + FLAGS.finer_discretization_window, EventTime.Unit.US + ), + plan_ahead_no_consideration_gap=EventTime( + FLAGS.scheduler_plan_ahead_no_consideration_gap, EventTime.Unit.US + ), + ) + else: + raise ValueError(f"Unknown 
scheduler {FLAGS.scheduler}.") # TODO: Items in _registered_task_graphs are never deleted right now, needs to be handled. self._registered_task_graphs = {} @@ -550,7 +597,7 @@ async def _tick_simulator(self): with self._lock: if self._simulator is not None: stime = self.__stime() - # self._logger.debug(f"[{stime}] Simulator tick") + self._logger.debug(f"[{stime}] Simulator tick") self._simulator.tick(until=stime) # else: # print("Simulator instance is None") From 400ec449e854393e1e61966493b60aa2313c47a5 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 25 Nov 2024 08:14:43 -0500 Subject: [PATCH 070/128] format service --- rpc/service.py | 68 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index da9a5bfb..7198409c 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -131,7 +131,7 @@ def __init__(self) -> None: self._scheduler = FIFOScheduler( preemptive=FLAGS.preemption, runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US), - enforce_deadlines=FLAGS.enforce_deadlines, # TODO: (DG) Check why this isnt passed in the simulator + enforce_deadlines=FLAGS.enforce_deadlines, # TODO: (DG) Check why this isnt passed in the simulator _flags=FLAGS, ) elif FLAGS.scheduler == "EDF": @@ -178,8 +178,10 @@ def __init__(self) -> None: raise ValueError(f"Unknown scheduler {FLAGS.scheduler}.") # TODO: Items in _registered_task_graphs are never deleted right now, needs to be handled. 
- self._registered_task_graphs = {} - self._registered_app_drivers = {} # Spark driver id differs from taskgraph name (application id) + self._registered_task_graphs = {} + self._registered_app_drivers = ( + {} + ) # Spark driver id differs from taskgraph name (application id) self._lock = threading.Lock() super().__init__() @@ -252,7 +254,7 @@ async def DeregisterFramework(self, request, context): async def RegisterDriver(self, request, context): stime = self.__stime() - + if request.id in self._registered_app_drivers: msg = f"[{stime}] Driver with id '{request.id}' is already registered" self._logger.error(msg) @@ -261,12 +263,14 @@ async def RegisterDriver(self, request, context): message=msg, worker_id=self.__get_worker_id(), ) - - # TODO: Update the registered_app_drivers to map the driver id to + + # TODO: Update the registered_app_drivers to map the driver id to # application id once the taskgraph is registered. self._registered_app_drivers[request.id] = None - - msg = f"[{stime}] Successfully registered driver {request.id} for an application." + + msg = ( + f"[{stime}] Successfully registered driver {request.id} for an application." + ) self._logger.info(msg) return erdos_scheduler_pb2.RegisterDriverResponse( success=True, @@ -284,7 +288,7 @@ async def DeregisterDriver(self, request, context): success=False, message=msg, ) - + # TODO: Dummy mapping from driver to task graph (application), so task_graph_name is None. # Deletion of taskgraph from registered_task_graphs and driver from registered_app_drivers should be done carefully. 
task_graph_name = self._registered_app_drivers[request.id] @@ -409,8 +413,12 @@ async def RegisterEnvironmentReady(self, request, context): with self._lock: self._simulator._event_queue.add_event(update_workload_event) self._simulator._event_queue.add_event(scheduler_start_event) - self._logger.info(f"[{stime}] Adding event {update_workload_event} to the simulator's event queue") - self._logger.info(f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue") + self._logger.info( + f"[{stime}] Adding event {update_workload_event} to the simulator's event queue" + ) + self._logger.info( + f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue" + ) msg = f"[{stime}] Successfully marked environment as ready for task graph '{r.task_graph.name}'" self._logger.info(msg) @@ -435,8 +443,9 @@ async def RegisterWorker(self, request, context): cpu_resource = Resource(name="Slot") worker_resources = Resources( resource_vector={ - cpu_resource: request.cores if not FLAGS.override_worker_cpu_count - else 640 + cpu_resource: ( + request.cores if not FLAGS.override_worker_cpu_count else 640 + ) }, _logger=self._logger, ) @@ -501,9 +510,17 @@ async def GetPlacements(self, request, context): # Construct response. 
Notably, we apply stage-id mapping placements = [] for placement in sim_placements: - worker_id = self.__get_worker_id() if placement.placement_type == Placement.PlacementType.PLACE_TASK else "None" + worker_id = ( + self.__get_worker_id() + if placement.placement_type == Placement.PlacementType.PLACE_TASK + else "None" + ) task_id = r.stage_id_mapping[placement.task.name] - cores = sum(x for _, x in placement.execution_strategy.resources.resources) if placement.placement_type == Placement.PlacementType.PLACE_TASK else 0 + cores = ( + sum(x for _, x in placement.execution_strategy.resources.resources) + if placement.placement_type == Placement.PlacementType.PLACE_TASK + else 0 + ) if placement.placement_type not in ( Placement.PlacementType.PLACE_TASK, @@ -521,9 +538,7 @@ async def GetPlacements(self, request, context): == Placement.PlacementType.CANCEL_TASK, } ) - self._logger.info( - f"Sending placements for '{r.task_graph.name}': {placements}" - ) + self._logger.info(f"Sending placements for '{r.task_graph.name}': {placements}") return erdos_scheduler_pb2.GetPlacementsResponse( success=True, @@ -565,8 +580,8 @@ async def NotifyTaskCompletion(self, request, context): actual_task_completion_time = ( task.start_time + task.slowest_execution_strategy.runtime ) - - # NOTE: Although the actual_task_completion_time works for task completion notifications that arrive early, it is + + # NOTE: Although the actual_task_completion_time works for task completion notifications that arrive early, it is # inaccurate for task completion notifications that occur past that time. Thus, a max of the current and actual completion time # is taken to ensure that the task is marked completed at the correct time. 
task_finished_event = Event( @@ -576,14 +591,21 @@ async def NotifyTaskCompletion(self, request, context): ) scheduler_start_event = Event( event_type=EventType.SCHEDULER_START, - time=max(actual_task_completion_time.to(EventTime.Unit.US), stime.to(EventTime.Unit.US)), + time=max( + actual_task_completion_time.to(EventTime.Unit.US), + stime.to(EventTime.Unit.US), + ), ) with self._lock: self._simulator._event_queue.add_event(task_finished_event) self._simulator._event_queue.add_event(scheduler_start_event) - self._logger.info(f"[{stime}] Adding event {task_finished_event} to the simulator's event queue") - self._logger.info(f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue") + self._logger.info( + f"[{stime}] Adding event {task_finished_event} to the simulator's event queue" + ) + self._logger.info( + f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue" + ) msg = f"[{stime}] Successfully processed completion of task '{request.task_id}' of task graph '{r.task_graph.name}'" self._logger.info(msg) From f69c0a9fff26a598d168f023c6a3ff179cd99596 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 25 Nov 2024 09:12:40 -0500 Subject: [PATCH 071/128] add enforce deadlines flag --- tests/test_service.py | 60 +++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/tests/test_service.py b/tests/test_service.py index 741209dc..8fb54911 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -10,7 +10,7 @@ @pytest.fixture(scope="module", autouse=True) def service(): - process = subprocess.Popen(["python", "-m", "rpc.service"]) + process = subprocess.Popen(["python", "-m", "rpc.service", "--enforce_deadlines"]) channel = grpc.insecure_channel("localhost:50051") try: grpc.channel_ready_future(channel).result(timeout=5) @@ -46,7 +46,6 @@ def test_service(): r"Registered worker \(id=1234, name=test_worker\)", response.message ) - # Try to fetch placements 
for an unregistered task graph # Get placements for the task, should be empty request = erdos_scheduler_pb2.GetPlacementsRequest( @@ -97,7 +96,7 @@ def test_service(): ) and response.num_executors == 10 ) - + # Introduce a 2s delay in getting the env ready time.sleep(2) @@ -129,44 +128,38 @@ def test_service(): ) actual_task_ids.add(placement.task_id) assert actual_task_ids == {0, 1} - + # Wait for 3 seconds and trigger notify task completion for tasks 0 and 1 time.sleep(3) - + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( - application_id="task-graph-0", - task_id=0, - timestamp=1234567890 + application_id="task-graph-0", task_id=0, timestamp=1234567890 ) response = stub.NotifyTaskCompletion(request) assert response.success - + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( - application_id="task-graph-0", - task_id=1, - timestamp=1234567890 + application_id="task-graph-0", task_id=1, timestamp=1234567890 ) response = stub.NotifyTaskCompletion(request) assert response.success - + # Wait for 20s to allow the service to execute task completion for fastest task time.sleep(20) - + # Attempt to incorrectly notify task completion for task 3, which hasnt started yet request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( - application_id="task-graph-0", - task_id=3, - timestamp=1234567890 + application_id="task-graph-0", task_id=3, timestamp=1234567890 ) response = stub.NotifyTaskCompletion(request) assert not response.success - + # Wait 2s to allow the service to process the incorrect task completion time.sleep(2) - + # Wait for 25s to allow the service to finish execution of task 0 time.sleep(25) - + # This will unlock task 2, which should now be returned as a placement request = erdos_scheduler_pb2.GetPlacementsRequest( timestamp=1234567890, @@ -181,7 +174,7 @@ def test_service(): ) actual_task_ids.add(placement.task_id) assert actual_task_ids == {2} - + # Register the second (correct) TaskGraph, wont be able to run due to inadequate 
resources request = erdos_scheduler_pb2.RegisterTaskGraphRequest( id="task-graph-1", @@ -204,10 +197,10 @@ def test_service(): ) and response.num_executors == 10 ) - + # Introduce a 2s delay in getting the env ready time.sleep(2) - + # Mark the environment as ready request = erdos_scheduler_pb2.RegisterEnvironmentReadyRequest( id="task-graph-1", @@ -219,10 +212,10 @@ def test_service(): r"Successfully marked environment as ready for task graph 'Q4\[task-graph-1\]@1'", response.message, ) - + # Wait for 10s to get the placements for the second task graph time.sleep(10) - + # Get placements for the task, none should be placed since worker has inadequate resources request = erdos_scheduler_pb2.GetPlacementsRequest( timestamp=1234567890, @@ -237,33 +230,34 @@ def test_service(): ) actual_task_ids.add(placement.task_id) assert len(actual_task_ids) == 0 - + # Wait for 100 more seconds and request placements again time.sleep(100) - + # Notify task completion for task 2 in task graph 0 to trigger scheduler run again request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( - application_id="task-graph-0", - task_id=2, - timestamp=1234567890 + application_id="task-graph-0", task_id=2, timestamp=1234567890 ) response = stub.NotifyTaskCompletion(request) assert response.success - + # Wait for 2 seconds to allow scheduler to process task completion and run scheduler time.sleep(2) - + # Get placements for the task, entire taskgraph should be cancelled request = erdos_scheduler_pb2.GetPlacementsRequest( timestamp=1234567890, id="task-graph-1", ) response = stub.GetPlacements(request) + print(response) assert response.success actual_task_ids = set() for placement in response.placements: assert ( - placement.worker_id == "None" and placement.application_id == "task-graph-1" and placement.cancelled == True + placement.worker_id == "None" + and placement.application_id == "task-graph-1" + and placement.cancelled == True ) actual_task_ids.add(placement.task_id) assert 
actual_task_ids == {0, 1} From 9e8ab1e481fdaa1379a603d3e21fb77dafb3eda7 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 26 Nov 2024 11:53:12 -0500 Subject: [PATCH 072/128] step workers during a tick --- simulator.py | 77 ++++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/simulator.py b/simulator.py index 3712c94f..bfbee064 100644 --- a/simulator.py +++ b/simulator.py @@ -484,32 +484,10 @@ def simulate(self) -> None: This loop requires the `Workload` to be populated with the `TaskGraph`s whose execution is to be simulated using the Scheduler. """ - self.__simulate_f(lambda _: True) - - def tick(self, until: EventTime) -> None: - """Tick the simulator until the specified time""" - self.__simulate_f(should_continue=lambda et: et <= until) - - def __simulate_f(self, should_continue: Callable[[EventTime], bool]) -> None: - """Helper function to run the simulator until a predicate is satisfied. - - The predicate (`should_continue`) receives the time of the next event - in the queue, using which it can use to decide whether or not to - simulate. - """ - # Run the simulator loop. - while True: - top = self._event_queue.peek() - if not top or not should_continue(top.time): - break - + def f(): time_until_next_event = self.__time_until_next_event() - - # If there are any running tasks, step through the execution of the - # Simulator until the closest remaining time. running_tasks = self._worker_pools.get_placed_tasks() - - if not self._orchestrated and len(running_tasks) > 0: + if len(running_tasks) > 0: # There are running tasks, figure out the minimum remaining # time across all the tasks. min_task_remaining_time = min( @@ -527,30 +505,53 @@ def __simulate_f(self, should_continue: Callable[[EventTime], bool]) -> None: # the next event in the queue, step all workers until the # completion of that task, otherwise, handle the next event. 
if min_task_remaining_time < time_until_next_event: - self.__step(step_size=min_task_remaining_time) + step_size = min_task_remaining_time else: - # NOTE: We step here so that all the Tasks that are going - # to finish as a result of this step have their TASK_FINISHED - # events processed first before any future placement occurs - # that is decided prior. - self.__step(step_size=time_until_next_event) - if self.__handle_event(self._event_queue.next()): - break + step_size = time_until_next_event else: - # Step until the next event is supposed to be executed. - self.__step(step_size=time_until_next_event) - if self.__handle_event(self._event_queue.next()): - break + step_size = time_until_next_event + return step_size + self.__simulate_f(should_step=f) + + def tick(self, until: EventTime) -> None: + """Tick the simulator until the specified time""" + def f(): + time_until_next_event = self.__time_until_next_event() + if time_until_next_event.is_invalid(): + if until == self._simulator_time: + return None + return until - self._simulator_time + elif (time_until_next_event + self._simulator_time) <= until: + return time_until_next_event + else: + return None + self.__simulate_f(should_step=f) + + def __simulate_f(self, should_step: Callable[[EventTime], bool]) -> None: + """TODO doc + """ + # Step the simulator loop. 
+ while True: + step_size = should_step() + if not step_size: + break + self.__step(step_size=step_size) + if self._event_queue.peek() and self.__handle_event(self._event_queue.next()): + break def get_current_placements_for_task_graph( self, task_graph_name: str ) -> List[Placement]: if task_graph_name not in self._current_task_graph_placements: - raise ValueError(f"Task graph '{task_graph_name}' does not exist") + self._logger.warning(f"Cannot recognize task graph '{task_graph_name}'") + return [] return list(self._current_task_graph_placements[task_graph_name].values()) def __time_until_next_event(self) -> EventTime: - return self._event_queue.peek().time - self._simulator_time + if self._event_queue.peek(): + return self._event_queue.peek().time - self._simulator_time + else: + return EventTime.invalid() def __handle_scheduler_start(self, event: Event) -> None: """Handle the SCHEDULER_START event. The method invokes the scheduler, and adds From 84d89d20c7e6ef297991592a5844d97b28b2ab10 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 26 Nov 2024 13:56:59 -0500 Subject: [PATCH 073/128] update placement time only if it is in the past --- simulator.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/simulator.py b/simulator.py index bfbee064..e952c9e4 100644 --- a/simulator.py +++ b/simulator.py @@ -2087,10 +2087,17 @@ def __run_scheduler(self, event: Event) -> Event: f"Received no Placements object from the Scheduler at {event.time}.", ) - # Calculate the time at which the placements need to be applied. placement_time = event.time + placements.runtime + for placement in placements: - placement._placement_time = placement_time + # If the placement is in the past, update it to match + # `placement_time` + # This scenario happens when the `scheduler_runtime` is non-zero. + if placement._placement_time and placement._placement_time < placement_time: + self._logger.warning( + f"[{self._simulator_time}] Placement is in the past. 
Updating placement time from {placement._placement_time} to {placement_time}" + ) + placement._placement_time = placement_time # Save the placements until the placement time arrives. self._last_scheduler_placements = placements From bf301aa6a68960446375750c934a797fac805e5b Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 26 Nov 2024 13:57:12 -0500 Subject: [PATCH 074/128] refactor file --- simulator.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/simulator.py b/simulator.py index e952c9e4..78598b6f 100644 --- a/simulator.py +++ b/simulator.py @@ -484,6 +484,7 @@ def simulate(self) -> None: This loop requires the `Workload` to be populated with the `TaskGraph`s whose execution is to be simulated using the Scheduler. """ + def f(): time_until_next_event = self.__time_until_next_event() running_tasks = self._worker_pools.get_placed_tasks() @@ -511,10 +512,12 @@ def f(): else: step_size = time_until_next_event return step_size + self.__simulate_f(should_step=f) def tick(self, until: EventTime) -> None: """Tick the simulator until the specified time""" + def f(): time_until_next_event = self.__time_until_next_event() if time_until_next_event.is_invalid(): @@ -525,18 +528,20 @@ def f(): return time_until_next_event else: return None + self.__simulate_f(should_step=f) def __simulate_f(self, should_step: Callable[[EventTime], bool]) -> None: - """TODO doc - """ + """TODO doc""" # Step the simulator loop. 
while True: step_size = should_step() if not step_size: break self.__step(step_size=step_size) - if self._event_queue.peek() and self.__handle_event(self._event_queue.next()): + if self._event_queue.peek() and self.__handle_event( + self._event_queue.next() + ): break def get_current_placements_for_task_graph( @@ -713,7 +718,7 @@ def __create_events_from_task_placement_skip( ) self._current_task_graph_placements[placement.task.task_graph][ placement.task.id - ] = placement + ] = placement if task_graph.is_cancelled(): released_tasks_from_new_task_graph = ( From 049954136f4cb68988748d69d8276de335a27308 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 26 Nov 2024 14:01:01 -0500 Subject: [PATCH 075/128] document refactored simulate_f --- simulator.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/simulator.py b/simulator.py index 78598b6f..35ec94c7 100644 --- a/simulator.py +++ b/simulator.py @@ -531,9 +531,20 @@ def f(): self.__simulate_f(should_step=f) - def __simulate_f(self, should_step: Callable[[EventTime], bool]) -> None: - """TODO doc""" - # Step the simulator loop. + def __simulate_f(self, should_step: Callable[None, Optional[EventTime]]) -> None: + """Steps the simulator while a predicate is satisfied. + + This method continuously advances the simulation by calling the + provided `should_step` function, which determines the size of each + simulation step. The simulation continues until `should_step` returns + None, indicating that stepping should stop. + + Args: + should_step (Callable[[EventTime], bool]): + A predicate function that determines the next step size for the simulation. + - If the function returns an EventTime value, the simulator steps by that amount. + - If the function returns None, the simulation stops. 
+ """ while True: step_size = should_step() if not step_size: From 953906931fc0902333d4b7e3e212e18e2be9e0f5 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 26 Nov 2024 16:07:55 -0500 Subject: [PATCH 076/128] handle case where task graph cannot be accommodated in worker pool --- data/tpch_loader.py | 20 ++++----- rpc/service.py | 104 ++++++++++++++++++++++++++++---------------- 2 files changed, 75 insertions(+), 49 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index 20d590cf..91280ec6 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -58,11 +58,10 @@ def __init__( query_num = int(query["name"][1:]) self._graphs[query_num] = query["graph"] - def make_task_graph( + def make_job_graph( self, id: str, query_num: int, - release_time: EventTime, dependencies: Optional[List[Dict[str, Any]]] = None, profile_type: Optional[str] = None, dataset_size: Optional[int] = None, @@ -129,17 +128,11 @@ def make_task_graph( child_job = name_to_job[child] job_graph.add_child(job, child_job) - # Construct TaskGraph from JobGraph - task_graph = job_graph.get_next_task_graph( - start_time=release_time, - _flags=self._flags, - ) - self._logger.info( - f"Constructed TaskGraph for TPC-H query {query_name(query_num)}." + f"Constructed JobGraph for TPC-H query {query_name(query_num)}." 
) - return task_graph, deps_mapping + return job_graph, deps_mapping def __make_work_profile( self, @@ -385,10 +378,13 @@ def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: return None for i, (q, t) in enumerate(to_release): - task_graph, _ = self._tpch_loader.make_task_graph( + job_graph, _ = self._tpch_loader.make_job_graph( id=str(i), query_num=q, - release_time=t, + ) + task_graph = job_graph.get_next_task_graph( + start_time=t, + _flags=self._flags, ) self._workload.add_task_graph(task_graph) diff --git a/rpc/service.py b/rpc/service.py index 7198409c..84351653 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -12,7 +12,15 @@ import main from simulator import Simulator, Event, EventTime, EventType from workers import Worker, WorkerPool, WorkerPools -from workload import Resource, Resources, Workload, TaskGraph, TaskState, Placement +from workload import ( + Resource, + Resources, + Workload, + TaskGraph, + TaskState, + Placement, + JobGraph, +) from data import BaseWorkloadLoader from data.tpch_loader import TpchLoader from utils import setup_logging, setup_csv_logging @@ -73,12 +81,10 @@ def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: return self._workload -# TODO(elton): rename to RegisteredApplication -# TODO(elton): write documentation on how to use - - @dataclass -class RegisteredTaskGraph: +class RegisteredApplication: + # TODO(elton): documentation + gen: any # TODO(elton): proper type task_graph: TaskGraph = None stage_id_mapping: any = None # TODO(elton): proper type @@ -177,8 +183,8 @@ def __init__(self) -> None: else: raise ValueError(f"Unknown scheduler {FLAGS.scheduler}.") - # TODO: Items in _registered_task_graphs are never deleted right now, needs to be handled. - self._registered_task_graphs = {} + # TODO: Items in _registered_applications are never deleted right now, needs to be handled. 
+ self._registered_applications = {} self._registered_app_drivers = ( {} ) # Spark driver id differs from taskgraph name (application id) @@ -290,7 +296,7 @@ async def DeregisterDriver(self, request, context): ) # TODO: Dummy mapping from driver to task graph (application), so task_graph_name is None. - # Deletion of taskgraph from registered_task_graphs and driver from registered_app_drivers should be done carefully. + # Deletion of taskgraph from registered_applications and driver from registered_app_drivers should be done carefully. task_graph_name = self._registered_app_drivers[request.id] del self._registered_app_drivers[request.id] @@ -311,7 +317,7 @@ async def RegisterTaskGraph(self, request, context): success=False, message=msg, num_executors=0 ) - if request.id in self._registered_task_graphs: + if request.id in self._registered_applications: msg = f"[{stime}] The task graph (id={request.id}, name={request.name}) is already registered" self._logger.error(msg) return erdos_scheduler_pb2.RegisterTaskGraphResponse( @@ -345,35 +351,47 @@ async def RegisterTaskGraph(self, request, context): } ) - def gen(release_time): - # Construct the task graph - try: - task_graph, stage_id_mapping = self._data_loaders[ - DataLoader.TPCH - ].make_task_graph( - id=request.id, - query_num=query_num, - release_time=release_time, - dependencies=dependencies, - dataset_size=dataset_size, - max_executors_per_job=max_executors_per_job, - runtime_unit=EventTime.Unit.S, - ) - except Exception as e: - msg = f"[{stime}] Failed to load TPCH query {query_num}. 
Exception: {e}" - return erdos_scheduler_pb2.RegisterTaskGraphResponse( - success=False, message=msg, num_executors=0 - ) + # Create a job graph + try: + job_graph, stage_id_mapping = self._data_loaders[ + DataLoader.TPCH + ].make_job_graph( + id=request.id, + query_num=query_num, + dependencies=dependencies, + dataset_size=dataset_size, + max_executors_per_job=max_executors_per_job, + runtime_unit=EventTime.Unit.S, + ) + except Exception as e: + msg = f"[{stime}] Failed to load TPCH query {query_num}. Exception: {e}" + self._logger.error(msg) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message=msg, num_executors=0 + ) + if not self.__can_accomodate_task_graph(job_graph): + msg = f"[{stime}] The worker Pool cannot accomodate the task graph '{request.id}'" + self._logger.error(msg) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message=msg, num_executors=0 + ) + + def gen(release_time): + task_graph = job_graph.get_next_task_graph( + start_time=release_time, + _flags=FLAGS, + ) return task_graph, stage_id_mapping else: msg = f"[{stime}] The service only supports TPCH queries" + self._logger.error(msg) return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=False, message=msg, num_executors=0 ) - self._registered_task_graphs[request.id] = RegisteredTaskGraph(gen=gen) + self._registered_applications[request.id] = RegisteredApplication(gen=gen) msg = f"[{stime}] Registered task graph '{request.id}' successfully" self._logger.info(msg) @@ -386,7 +404,7 @@ def gen(release_time): async def RegisterEnvironmentReady(self, request, context): stime = self.__stime() - if request.id not in self._registered_task_graphs: + if request.id not in self._registered_applications: msg = f"[{stime}] Task graph of id '{request.id}' is not registered or does not exist" self._logger.error(msg) return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( @@ -394,7 +412,7 @@ async def RegisterEnvironmentReady(self, request, 
context): message=msg, ) - r = self._registered_task_graphs[request.id] + r = self._registered_applications[request.id] # Generate the task graph now r.generate_task_graph(stime) @@ -470,7 +488,7 @@ async def GetPlacements(self, request, context): stime = self.__stime() # Check if the task graph is registered - if request.id not in self._registered_task_graphs: + if request.id not in self._registered_applications: msg = f"[{stime}] Task graph with id '{request.id}' is not registered or does not exist" self._logger.error(msg) return erdos_scheduler_pb2.GetPlacementsResponse( @@ -478,7 +496,7 @@ async def GetPlacements(self, request, context): message=msg, ) - r = self._registered_task_graphs[request.id] + r = self._registered_applications[request.id] if r.task_graph is None: msg = f"[{stime}] Task graph '{request.id}' is not ready" @@ -550,7 +568,7 @@ async def NotifyTaskCompletion(self, request, context): stime = self.__stime() # Check if the task graph is registered - if request.application_id not in self._registered_task_graphs: + if request.application_id not in self._registered_applications: msg = f"[{stime}] Task graph with id '{request.id}' is not registered or does not exist" self._logger.error(msg) return erdos_scheduler_pb2.NotifyTaskCompletionResponse( @@ -558,7 +576,7 @@ async def NotifyTaskCompletion(self, request, context): message=msg, ) - r = self._registered_task_graphs[request.application_id] + r = self._registered_applications[request.application_id] task = r.task_graph.get_task(r.stage_id_mapping[request.task_id]) if task is None: msg = f"[{stime}] Task '{request.task_id}' does not exist in the task graph '{r.task_graph.name}'" @@ -643,10 +661,22 @@ def __get_worker_pool(self): # Simulator maintains only one worker pool, so this should be fine return next(iter(self._simulator._worker_pools.worker_pools)) + def __get_worker(self): + return self.__get_worker_pool().workers[0] + def __get_worker_id(self): # We return the name here because we 
register the worker id from # Spark as the name of the worker in the worker pool - return self.__get_worker_pool().workers[0].name + return self.__get_worker().name + + def __can_accomodate_task_graph(self, job_graph: JobGraph): + worker_resources = self.__get_worker().resources + for job in job_graph: + for strat in job.execution_strategies: + for resource, quantity in strat.resources.resources: + if worker_resources.get_total_quantity(resource) < quantity: + return False + return True async def serve(server): From 89ee06dfbc1b7ebdeeb4fb2059d58636ee58d4db Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 26 Nov 2024 22:32:24 -0500 Subject: [PATCH 077/128] Potpourri of bug fixes and improvements - Undo stepping of worker every tick; it lead to some errors pertaining to task eviction. Punt investigation for later - Fix bug associated with mapping task ids to spark stage ids - Add a flag to control how the minimum amount by which a placement can be pushed into the future --- main.py | 6 ++++++ rpc/service.py | 29 ++++++++++++++++++++++------- simulator.py | 28 +++++++++++++++++++--------- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/main.py b/main.py index b501c13f..0bc956dc 100644 --- a/main.py +++ b/main.py @@ -121,6 +121,12 @@ False, "Runs the simulator in orchestrated mode. Currently used by the ERDOS service.", ) +flags.DEFINE_integer( + "min_placement_push_duration", + 1, + "The duration (in µs) by which to push a task placement if it cannot be" + "placed on a worker at its original time", +) # Benchmark related flags. 
flags.DEFINE_integer( diff --git a/rpc/service.py b/rpc/service.py index 84351653..11a197a8 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -87,8 +87,10 @@ class RegisteredApplication: gen: any # TODO(elton): proper type task_graph: TaskGraph = None - stage_id_mapping: any = None # TODO(elton): proper type - last_gen: any = None # TODO(elton): proper type + + _forward: any = None # TODO(elton): proper type + _backward: any = None # TODO(elton): proper type + _last_gen: any = None # TODO(elton): proper type def __init__(self, gen): self.gen = gen @@ -96,8 +98,15 @@ def __init__(self, gen): def generate_task_graph(self, release_time): task_graph, stage_id_mapping = self.gen(release_time) self.task_graph = task_graph - self.stage_id_mapping = stage_id_mapping - self.last_gen = release_time + self._forward = stage_id_mapping + self._backward = {v: k for k, v in self._forward.items()} + self._last_gen = release_time + + def spark_task_id(self, task_id): + return self._backward[task_id] + + def canonical_task_id(self, task_id): + return self._forward[task_id] class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): @@ -218,6 +227,8 @@ async def RegisterFramework(self, request, context): # Enable orchestrated mode FLAGS.orchestrated = True + # Set minimum placement push duration to 1s + FLAGS.min_placement_push_duration = 1_000_000 self._simulator = Simulator( scheduler=self._scheduler, worker_pools=WorkerPools( @@ -528,12 +539,16 @@ async def GetPlacements(self, request, context): # Construct response. 
Notably, we apply stage-id mapping placements = [] for placement in sim_placements: + # Ignore virtual placements + if placement.task.state < TaskState.RELEASED: + continue + worker_id = ( self.__get_worker_id() if placement.placement_type == Placement.PlacementType.PLACE_TASK else "None" ) - task_id = r.stage_id_mapping[placement.task.name] + task_id = r.spark_task_id(placement.task.name) cores = ( sum(x for _, x in placement.execution_strategy.resources.resources) if placement.placement_type == Placement.PlacementType.PLACE_TASK @@ -577,7 +592,7 @@ async def NotifyTaskCompletion(self, request, context): ) r = self._registered_applications[request.application_id] - task = r.task_graph.get_task(r.stage_id_mapping[request.task_id]) + task = r.task_graph.get_task(r.canonical_task_id(request.task_id)) if task is None: msg = f"[{stime}] Task '{request.task_id}' does not exist in the task graph '{r.task_graph.name}'" self._logger.error(msg) @@ -587,7 +602,7 @@ async def NotifyTaskCompletion(self, request, context): ) if task.state != TaskState.RUNNING: - msg = f"[{stime}] Received task completion notification for task '{request.task_id}' but it is not running" + msg = f"[{stime}] Received task completion notification for task '{request.task_id}' (mapped to '{r.canonical_task_id(request.task_id)}') of '{r.task_graph.name}' but it is not running" self._logger.error(msg) return erdos_scheduler_pb2.NotifyTaskCompletionResponse( success=False, diff --git a/simulator.py b/simulator.py index 35ec94c7..9fc47137 100644 --- a/simulator.py +++ b/simulator.py @@ -382,6 +382,11 @@ def event_representation_filter(record): # Is the simulator orchestrated? self._orchestrated = _flags.orchestrated + # Minimum duration by which to push task placements + self._min_placement_push_duration = EventTime( + _flags.min_placement_push_duration, EventTime.Unit.US + ) + # Initialize the event queue. 
# To make the system continue working the loop, we add three events: # - SIMULATOR_START: A notional event start the simulator and log into the CSV. @@ -520,14 +525,14 @@ def tick(self, until: EventTime) -> None: def f(): time_until_next_event = self.__time_until_next_event() - if time_until_next_event.is_invalid(): - if until == self._simulator_time: - return None - return until - self._simulator_time - elif (time_until_next_event + self._simulator_time) <= until: + + if ( + not time_until_next_event.is_invalid() + and (time_until_next_event + self._simulator_time) <= until + ): return time_until_next_event - else: - return None + + return None self.__simulate_f(should_step=f) @@ -982,6 +987,10 @@ def count_placed_tasks(placements: Placements): ) ) + # NOP if there are no previous placements + if self._last_scheduler_placements is None: + return + num_placed = count_placed_tasks(self._last_scheduler_placements) num_unplaced = count_placed_tasks(self._last_scheduler_placements) - num_placed scheduler_runtime = event.time - self._last_scheduler_start_time @@ -1405,7 +1414,8 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: parent.remaining_time for parent in task_graph.get_parents(task) ) next_placement_time = event.time + max( - parent_completion_time, EventTime(1, EventTime.Unit.US) + parent_completion_time, + self._min_placement_push_duration, ) next_placement_event = Event( event_type=event.event_type, @@ -1462,7 +1472,7 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: task.id ] = event.placement else: - next_placement_time = event.time + EventTime(1, EventTime.Unit.US) + next_placement_time = event.time + self._min_placement_push_duration next_placement_event = Event( event_type=event.event_type, time=next_placement_time, From 98a5247e84ee6f027010766f65823712618dbaae Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 27 Nov 2024 09:21:04 -0500 Subject: [PATCH 078/128] Add log stats method 
to simulator --- simulator.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/simulator.py b/simulator.py index 9fc47137..7a8ef63b 100644 --- a/simulator.py +++ b/simulator.py @@ -43,6 +43,7 @@ class EventType(Enum): SCHEDULER_FINISHED = 12 # Signifies the end of the scheduler loop. SIMULATOR_END = 13 # Signify the end of the simulator loop. LOG_UTILIZATION = 14 # Ask the simulator to log worker pool utilization. + LOG_STATS = 15 # Log simulator statistics def __lt__(self, other) -> bool: # This method is used to order events in the event queue. We prioritize @@ -1763,17 +1764,17 @@ def __handle_event(self, event: Event) -> bool: self.__handle_scheduler_finish(event) elif event.event_type == EventType.SIMULATOR_END: # End of the simulator loop. + assert event.time == self._simulator_time + self.log_stats() self._csv_logger.debug( - f"{event.time.time},SIMULATOR_END,{self._finished_tasks}," - f"{self._cancelled_tasks},{self._missed_task_deadlines}," - f"{self._finished_task_graphs}," - f"{len(self._workload.get_cancelled_task_graphs())}," - f"{self._missed_task_graph_deadlines}" + f"{event.time.time},SIMULATOR_END", ) self._logger.info("[%s] Ending the simulator loop.", event.time.time) return True elif event.event_type == EventType.LOG_UTILIZATION: self.__log_utilization(event.time) + elif event.event_type == EventType.LOG_STATS: + self.log_stats(event.time) else: raise ValueError(f"[{event.time}] Retrieved event of unknown type: {event}") return False @@ -2154,3 +2155,12 @@ def __log_utilization(self, sim_time: EventTime): f"{worker_pool_resources.get_allocated_quantity(resource)}," f"{worker_pool_resources.get_available_quantity(resource)}" ) + + def log_stats(self): + self._csv_logger.debug( + f"{self._simulator_time.time},LOG_STATS,{self._finished_tasks}," + f"{self._cancelled_tasks},{self._missed_task_deadlines}," + f"{self._finished_task_graphs}," + f"{len(self._workload.get_cancelled_task_graphs())}," + 
f"{self._missed_task_graph_deadlines}" + ) From 618348a25ba48d5703875504bf33c4bac009ad72 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 27 Nov 2024 09:40:52 -0500 Subject: [PATCH 079/128] check for worker registration --- rpc/service.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 11a197a8..7dfce7a5 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -321,8 +321,8 @@ async def DeregisterDriver(self, request, context): async def RegisterTaskGraph(self, request, context): stime = self.__stime() - if not self.__framework_registered(): - msg = f"[{stime}] Trying to register a task graph (id={request.id}, name={request.name}) but no framework has been registered yet." + if not self.__worker_registered(): + msg = f"[{stime}] Failed to register task graph (id={request.id}, name={request.name}) because no worker has been registered yet." self._logger.error(msg) return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=False, message=msg, num_executors=0 @@ -672,6 +672,11 @@ def __stime(self) -> EventTime: def __framework_registered(self): return self._simulator is not None + def __worker_registered(self): + return ( + self.__framework_registered() and len(self.__get_worker_pool().workers) > 0 + ) + def __get_worker_pool(self): # Simulator maintains only one worker pool, so this should be fine return next(iter(self._simulator._worker_pools.worker_pools)) From 15bf380cc93d2993fc177024a0d7a7758f3cb140 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 27 Nov 2024 09:59:42 -0500 Subject: [PATCH 080/128] update documentation in service --- rpc/service.py | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 7dfce7a5..4979d591 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -4,7 +4,7 @@ import asyncio from concurrent import futures from urllib.parse import urlparse -from typing 
import Optional, Dict +from typing import Optional, Dict, Callable, Tuple from enum import Enum from dataclasses import dataclass @@ -83,30 +83,56 @@ def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: @dataclass class RegisteredApplication: - # TODO(elton): documentation + """ + Represents a registered application that can be used to generate task + graphs. It also manages the mapping between Spark stage IDs and canonical + task IDs. - gen: any # TODO(elton): proper type + A registered application is ready if the `task_graph` attribute is set. + + Attributes: + gen (Callable[[EventTime], Tuple[TaskGraph, Dict[int,int]]]): + A function that takes a release time and outputs: + - A task graph + - A mapping from Spark stage IDs to canonical task IDs + + task_graph (TaskGraph, optional): + The generated task graph for the application. Defaults to None. + + Methods: + generate_task_graph(release_time: EventTime): + Sets the `task_graph` attribute by generating a task graph for a + given `release_time`. + + spark_task_id(task_id: int): + Returns the canonical task ID corresponding to a Spark stage ID. + + canonical_task_id(stage_id: int): + Returns the Spark stage ID corresponding to a canonical task ID. 
+ """ + + gen: Callable[[EventTime], Tuple[TaskGraph, Dict[int, int]]] task_graph: TaskGraph = None - _forward: any = None # TODO(elton): proper type - _backward: any = None # TODO(elton): proper type - _last_gen: any = None # TODO(elton): proper type + _forward: Dict[int, int] = None # spark stage id => canonical task id + _backward: Dict[int, int] = None # canonical task id => spark stage id + _last_gen: EventTime = None def __init__(self, gen): self.gen = gen - def generate_task_graph(self, release_time): + def generate_task_graph(self, release_time: EventTime): task_graph, stage_id_mapping = self.gen(release_time) self.task_graph = task_graph self._forward = stage_id_mapping self._backward = {v: k for k, v in self._forward.items()} self._last_gen = release_time - def spark_task_id(self, task_id): + def spark_task_id(self, task_id: int): return self._backward[task_id] - def canonical_task_id(self, task_id): - return self._forward[task_id] + def canonical_task_id(self, stage_id: int): + return self._forward[stage_id] class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): @@ -443,7 +469,7 @@ async def RegisterEnvironmentReady(self, request, context): self._simulator._event_queue.add_event(update_workload_event) self._simulator._event_queue.add_event(scheduler_start_event) self._logger.info( - f"[{stime}] Adding event {update_workload_event} to the simulator's event queue" + f"[{stime}] Added event {update_workload_event} to the simulator's event queue" ) self._logger.info( f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue" From 0b3a661b7bdb57d214bc4d74ca50be534a5df72a Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Wed, 27 Nov 2024 12:17:33 -0500 Subject: [PATCH 081/128] updating the service to operate in US instead of S time unit --- rpc/service.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 4979d591..c35f5e49 100644 --- a/rpc/service.py +++ 
b/rpc/service.py @@ -241,7 +241,7 @@ async def RegisterFramework(self, request, context): t = int(time.time()) framework_name = request.name self._master_uri = request.uri - self._initialization_time = EventTime(t, EventTime.Unit.S) + self._initialization_time = EventTime(t, EventTime.Unit.US) stime = self.__stime() parsed_uri = urlparse(self._master_uri) @@ -398,7 +398,7 @@ async def RegisterTaskGraph(self, request, context): dependencies=dependencies, dataset_size=dataset_size, max_executors_per_job=max_executors_per_job, - runtime_unit=EventTime.Unit.S, + runtime_unit=EventTime.Unit.US, ) except Exception as e: msg = f"[{stime}] Failed to load TPCH query {query_num}. Exception: {e}" @@ -692,7 +692,9 @@ def __stime(self) -> EventTime: if self._initialization_time is None: return EventTime.invalid() ts = int(time.time()) - ts = EventTime(ts, EventTime.Unit.S) + # NOTE: The service runs in the US time unit for better compatibility with the simulator. + # The simulator uses an abstract unit of time, and it is all relative. 
+ ts = EventTime(ts, EventTime.Unit.US) return ts - self._initialization_time def __framework_registered(self): From cca24d812ddd3da58e4846ceff14898f0cd4f5f4 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 27 Nov 2024 13:13:22 -0500 Subject: [PATCH 082/128] misc improvements --- rpc/service.py | 22 +++++++++++++++++++--- simulator.py | 16 +++++++++++----- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index c35f5e49..ca069f46 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -71,8 +71,8 @@ class DataLoader(Enum): class WorkloadLoader(BaseWorkloadLoader): - def __init__(self) -> None: - self._workload = Workload.empty() + def __init__(self, _flags) -> None: + self._workload = Workload.empty(_flags) def add_task_graph(self, task_graph: TaskGraph): self._workload.add_task_graph(task_graph) @@ -249,7 +249,7 @@ async def RegisterFramework(self, request, context): name=f"WorkerPool_{parsed_uri.netloc}", _logger=self._logger, ) - self._workload_loader = WorkloadLoader() + self._workload_loader = WorkloadLoader(FLAGS) # Enable orchestrated mode FLAGS.orchestrated = True @@ -298,6 +298,14 @@ async def DeregisterFramework(self, request, context): async def RegisterDriver(self, request, context): stime = self.__stime() + if not self.__worker_registered(): + msg = f"[{stime}] Failed to register driver (id={request.id}) because no worker has been registered yet." 
+ self._logger.error(msg) + return erdos_scheduler_pb2.RegisterDriverResponse( + success=False, + message=msg, + ) + if request.id in self._registered_app_drivers: msg = f"[{stime}] Driver with id '{request.id}' is already registered" self._logger.error(msg) @@ -337,6 +345,14 @@ async def DeregisterDriver(self, request, context): task_graph_name = self._registered_app_drivers[request.id] del self._registered_app_drivers[request.id] + # Log stats + log_stats_event = Event( + event_type=EventType.LOG_STATS, + time=stime, + ) + with self._lock: + self._simulator._event_queue.add_event(log_stats_event) + msg = f"[{stime}] Successfully de-registered driver with id {request.id} for task graph {task_graph_name}" self._logger.info(msg) return erdos_scheduler_pb2.DeregisterDriverResponse( diff --git a/simulator.py b/simulator.py index 7a8ef63b..98b97a9e 100644 --- a/simulator.py +++ b/simulator.py @@ -1138,6 +1138,7 @@ def __handle_task_cancellation(self, event: Event) -> None: f"{event.task.timestamp},{event.task.id},{event.task.task_graph}," f"{event.task.slowest_execution_strategy.runtime.time}" ) + self.log_stats(event.time) # If the task already had a placement, we remove the placement from our queue. if event.task.id in self._future_placement_events: @@ -1255,8 +1256,12 @@ def __handle_task_finished(self, event: Event) -> None: f"{task_graph.deadline.to(EventTime.Unit.US).time}," f"{tardiness.to(EventTime.Unit.US).time}" ) + if task_graph.deadline < event.time: self._missed_task_graph_deadlines += 1 + + self.log_stats(event.time) + self._logger.info( "[%s] Finished the TaskGraph %s with a deadline %s at the " "completion of the task %s with a tardiness of %s.", @@ -1764,8 +1769,7 @@ def __handle_event(self, event: Event) -> bool: self.__handle_scheduler_finish(event) elif event.event_type == EventType.SIMULATOR_END: # End of the simulator loop. 
- assert event.time == self._simulator_time - self.log_stats() + self.log_stats(event.time) self._csv_logger.debug( f"{event.time.time},SIMULATOR_END", ) @@ -1787,7 +1791,9 @@ def __step(self, step_size: EventTime = EventTime(1, EventTime.Unit.US)) -> None the clock (in us). """ if step_size < EventTime.zero(): - raise ValueError(f"Simulator cannot step backwards {step_size}") + raise ValueError( + f"[{self._simulator_time}] Simulator cannot step backwards {step_size}" + ) # Step the simulator for the required steps and construct TASK_FINISHED events # for any tasks that were able to complete their execution. @@ -2156,9 +2162,9 @@ def __log_utilization(self, sim_time: EventTime): f"{worker_pool_resources.get_available_quantity(resource)}" ) - def log_stats(self): + def log_stats(self, sim_time: EventTime): self._csv_logger.debug( - f"{self._simulator_time.time},LOG_STATS,{self._finished_tasks}," + f"{sim_time.time},LOG_STATS,{self._finished_tasks}," f"{self._cancelled_tasks},{self._missed_task_deadlines}," f"{self._finished_task_graphs}," f"{len(self._workload.get_cancelled_task_graphs())}," From 7387b063ef62a59a4453ba1eeffda3b36e025b91 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 27 Nov 2024 13:57:59 -0500 Subject: [PATCH 083/128] override flags in service --- rpc/service.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index ca069f46..cd25d664 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -137,6 +137,15 @@ def canonical_task_id(self, stage_id: int): class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): def __init__(self) -> None: + # Override some flags + + # Enable orchestrated mode + FLAGS.orchestrated = True + # Set minimum placement push duration to 1s + FLAGS.min_placement_push_duration = 1_000_000 + # Set scheduler runtime to zero + FLAGS.scheduler_runtime = 0 + self._logger = setup_logging( name=__name__, log_dir=FLAGS.log_dir, @@ -251,10 +260,6 @@ async 
def RegisterFramework(self, request, context): ) self._workload_loader = WorkloadLoader(FLAGS) - # Enable orchestrated mode - FLAGS.orchestrated = True - # Set minimum placement push duration to 1s - FLAGS.min_placement_push_duration = 1_000_000 self._simulator = Simulator( scheduler=self._scheduler, worker_pools=WorkerPools( @@ -583,6 +588,7 @@ async def GetPlacements(self, request, context): for placement in sim_placements: # Ignore virtual placements if placement.task.state < TaskState.RELEASED: + self._logger.debug("[{stime}] Skipping placement: {placement}") continue worker_id = ( From 1461a3cab39676f33bc0a7841057dc8c3bd0e0a8 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 27 Nov 2024 16:58:27 -0500 Subject: [PATCH 084/128] fix workload release bug --- rpc/service.py | 2 -- simulator.py | 11 +++++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index cd25d664..68bf8fec 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -141,8 +141,6 @@ def __init__(self) -> None: # Enable orchestrated mode FLAGS.orchestrated = True - # Set minimum placement push duration to 1s - FLAGS.min_placement_push_duration = 1_000_000 # Set scheduler runtime to zero FLAGS.scheduler_runtime = 0 diff --git a/simulator.py b/simulator.py index 98b97a9e..3a937ddf 100644 --- a/simulator.py +++ b/simulator.py @@ -525,6 +525,8 @@ def tick(self, until: EventTime) -> None: """Tick the simulator until the specified time""" def f(): + self._logger.debug(f"EQ: {self._event_queue}") + time_until_next_event = self.__time_until_next_event() if ( @@ -1618,6 +1620,14 @@ def __handle_update_workload(self, event: Event) -> None: # Release the Tasks that have become available. 
releasable_tasks = self._workload.get_releasable_tasks() + + # Ignore non-source tasks, they get auto-released when the parent finishes + def is_source_task(task): + task_graph = self._workload.get_task_graph(task.task_graph) + return task_graph.is_source_task(task) + + releasable_tasks = [task for task in releasable_tasks if is_source_task(task)] + self._logger.info( "[%s] The WorkloadLoader %s has %s TaskGraphs that released %s tasks.", self._simulator_time.to(EventTime.Unit.US).time, @@ -1666,6 +1676,7 @@ def __handle_update_workload(self, event: Event) -> None: max_release_time = self._simulator_time for task in releasable_tasks: + event = Event( event_type=EventType.TASK_RELEASE, time=task.release_time, task=task ) From 209a4a3644f78c7c8d9bd450b1e9598cf1f6549c Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 28 Nov 2024 13:48:03 -0500 Subject: [PATCH 085/128] update docs about sched time discretization --- main.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 0bc956dc..b77e0b6d 100644 --- a/main.py +++ b/main.py @@ -391,8 +391,9 @@ flags.DEFINE_integer( "scheduler_max_time_discretization", 5, - "The maximum discretization that the scheduler can have (in µs). " - "Only used when scheduler_adaptive_discretization flag is enabled. (default: 5)", + "The maximum discretization that the scheduler can have. " + "Only used when scheduler_adaptive_discretization flag is enabled. (default: 5)." + "Be careful about the EventTime.Unit. Some parts of the code assume Unit.US", ) flags.DEFINE_float( "scheduler_max_occupancy_threshold", @@ -431,9 +432,10 @@ "scheduler_time_discretization", 1, "The length of each slot in the space-time matrix to consider for scheduling the " - "tasks (in µs). The default value is 1µs, and a higher value can lead to faster " + "tasks. 
The default value is 1 (see note for unit), and a higher value can lead to faster " "solutions but a potentially lower goodput due to resources being blocked for the " - "entirety of the slot.", + "entirety of the slot. NOTE: Since time in the simulator is an abstract concept, be " + "careful about the EventTime.Unit. Some parts of the code might assume Unit.US", ) flags.DEFINE_enum( "scheduler_policy", From 9104f7e3c6332ba308d1348ef2bbf329b0f8be5b Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 28 Nov 2024 13:49:11 -0500 Subject: [PATCH 086/128] added comments about setting deadlines in generate_task_graph --- workload/jobs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/workload/jobs.py b/workload/jobs.py index e0acb2e4..8d7a90d3 100644 --- a/workload/jobs.py +++ b/workload/jobs.py @@ -809,6 +809,9 @@ def _generate_task_graph( # TODO (Sukrit): Right now, this assumes that all Tasks in the TaskGraph come # with the same deadline. At some point, we will have to implement a # heuristic-based deadline splitting technique. + + # NOTE: The taskgraph deadline is re-generated (and overwritten) after + # use_branch_predicated_deadlines code, since fuzz is invoked again there. task_deadline = release_time + self.completion_time.fuzz( deadline_variance, deadline_bounds ) @@ -883,6 +886,8 @@ def _generate_task_graph( else: weighted_task_graph_length = self.__get_completion_time() + # NOTE: This is the second time the deadline is being set, based on a second + # invocation of fuzz. 
task_graph_deadline = release_time + weighted_task_graph_length.fuzz( deadline_variance, deadline_bounds ) From 8fd7f2a0951b2dd2332ac942ba44355e0d8d702a Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 28 Nov 2024 15:28:24 -0500 Subject: [PATCH 087/128] updates to test script (verified across edf, fifo, dsched) --- tests/test_service.py | 65 +++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/tests/test_service.py b/tests/test_service.py index 8fb54911..1b92030f 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -175,7 +175,7 @@ def test_service(): actual_task_ids.add(placement.task_id) assert actual_task_ids == {2} - # Register the second (correct) TaskGraph, wont be able to run due to inadequate resources + # Attempt to register the second TaskGraph, wont be able to run due to inadequate resources request = erdos_scheduler_pb2.RegisterTaskGraphRequest( id="task-graph-1", name="TPCH Query 4 50 200", @@ -189,10 +189,33 @@ def test_service(): ], ) response = stub.RegisterTaskGraph(request) + assert ( + not response.success + and re.search( + r"The worker Pool cannot accomodate the task graph 'task-graph-1'", + response.message, + ) + and response.num_executors == 0 + ) + + # Register the third TaskGraph, will run but will get cancelled due to deadline miss + request = erdos_scheduler_pb2.RegisterTaskGraphRequest( + id="task-graph-2", + name="TPCH Query 4 50 50", + timestamp=1234567890, + dependencies=[ + {"key": {"id": 0, "name": "stage 0"}, "children_ids": [2]}, + {"key": {"id": 1, "name": "stage 1"}, "children_ids": [2]}, + {"key": {"id": 2, "name": "stage 2"}, "children_ids": [3]}, + {"key": {"id": 3, "name": "stage 3"}, "children_ids": [4]}, + {"key": {"id": 4, "name": "stage 4"}, "children_ids": []}, + ], + ) + response = stub.RegisterTaskGraph(request) assert ( response.success and re.search( - r"Registered task graph 'task-graph-1' successfully", + r"Registered task graph 'task-graph-2' 
successfully", response.message, ) and response.num_executors == 10 @@ -203,33 +226,33 @@ def test_service(): # Mark the environment as ready request = erdos_scheduler_pb2.RegisterEnvironmentReadyRequest( - id="task-graph-1", + id="task-graph-2", num_executors=10, timestamp=1234567890, ) response = stub.RegisterEnvironmentReady(request) assert response.success and re.search( - r"Successfully marked environment as ready for task graph 'Q4\[task-graph-1\]@1'", + r"Successfully marked environment as ready for task graph 'Q4\[task-graph-2\]@1'", response.message, ) # Wait for 10s to get the placements for the second task graph time.sleep(10) - # Get placements for the task, none should be placed since worker has inadequate resources + # Get placements for the taskgraph 3, one of first two root vertices should be placed since there are resources request = erdos_scheduler_pb2.GetPlacementsRequest( timestamp=1234567890, - id="task-graph-1", + id="task-graph-2", ) response = stub.GetPlacements(request) assert response.success actual_task_ids = set() for placement in response.placements: assert ( - placement.worker_id == "1234" and placement.application_id == "task-graph-1" + placement.worker_id == "1234" and placement.application_id == "task-graph-2" ) actual_task_ids.add(placement.task_id) - assert len(actual_task_ids) == 0 + assert actual_task_ids == {1} # Wait for 100 more seconds and request placements again time.sleep(100) @@ -244,21 +267,33 @@ def test_service(): # Wait for 2 seconds to allow scheduler to process task completion and run scheduler time.sleep(2) - # Get placements for the task, entire taskgraph should be cancelled + # Get placements for the task, entire taskgraph should be cancelled since deadline missed + # Other root vertex (0) will be cancelled first. Then the subsequent vertices. + # NOTE: The simulator will return all current placements for a taskgraph (including + # those already sent by the service) until the task is marked as finished. 
Spark will ignore it. + # In this scenario of task-graph-2, placements has two values- Task 0 in cancelled state and + # Task 1 in running state. The service will return both of them. request = erdos_scheduler_pb2.GetPlacementsRequest( timestamp=1234567890, - id="task-graph-1", + id="task-graph-2", ) response = stub.GetPlacements(request) print(response) assert response.success actual_task_ids = set() for placement in response.placements: - assert ( - placement.worker_id == "None" - and placement.application_id == "task-graph-1" - and placement.cancelled == True - ) + if placement.task_id == 0: + assert ( + placement.worker_id == "None" + and placement.application_id == "task-graph-2" + and placement.cancelled == True + ) + if placement.task_id == 1: + assert ( + placement.worker_id == "1234" + and placement.application_id == "task-graph-2" + and placement.cancelled == False + ) actual_task_ids.add(placement.task_id) assert actual_task_ids == {0, 1} From 9cf701a17ca4820a9a05ceb07a040b0c2baeb78d Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 28 Nov 2024 15:52:32 -0500 Subject: [PATCH 088/128] update documentation for the service --- rpc/spark_erdos_setup.md | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/rpc/spark_erdos_setup.md b/rpc/spark_erdos_setup.md index 0a6775fc..005ed44b 100644 --- a/rpc/spark_erdos_setup.md +++ b/rpc/spark_erdos_setup.md @@ -151,9 +151,23 @@ pip install -r rpc/requirements.txt python -m grpc_tools.protoc -I./rpc/protos --python_out=. --grpc_python_out=. ./rpc/protos/rpc/erdos_scheduler.proto ``` -### Run the service using +### Run the service ```bash -python -m rpc.service +python -m rpc.service --enforce_deadlines --scheduler_runtime=0 +``` + +The above command uses the default argument values from the `service.py` and `main.py`. The default scheduler is `EDF`. Other options available for the +service are `FIFO` and `TetriSched`. 
The DSched scheduler is a specific instantiation of the `TetriSched` scheduler. The other schedulers can be run +as follows: + +#### To instantiate FIFO scheduler for the service: +```bash +python -m rpc.service --scheduler=FIFO --enforce_deadlines --scheduler_runtime=0 +``` + +#### To instantiate DSched scheduler for the service: +```bash +python -m rpc.service --scheduler=TetriSched --enforce_deadlines --scheduler_runtime=0 --release_taskgraphs --opt_passes=CRITICAL_PATH_PASS --opt_passes=CAPACITY_CONSTRAINT_PURGE_PASS --oppasses=DYNAMIC_DISCRETIZATION_PASS --retract_schedules --scheduler_max_occupancy_threshold=0.999 --finer_discretization_at_prev_solution --scheduler_selective_rescheduling --scheduler_reconsideration_period=0.6 --scheduler_time_discretization=1 --scheduler_max_time_discretization=5 --finer_discretization_window=5 --scheduler_log_to_file ``` ### Run local tests for the erdos-spark service @@ -168,6 +182,10 @@ pytest tests/test_service.py ```bash python -m rpc.service ``` +Refer to the above section to instantiate different schedulers for the service. + +> NOTE: Since we emulate a 20-node spark cluster on a single system, an additional flag `--override_worker_cpu_count` needs to be passed in the +> service launch command. ### Start all components of the spark cluster Run the following commands from the root directory of the `spark-mirror` repository. 
From 25f4a3d9d539111d52b33a41adecd3d811b71194 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Fri, 29 Nov 2024 10:59:05 -0500 Subject: [PATCH 089/128] correctly handle task cancellations --- rpc/protos/rpc/erdos_scheduler.proto | 2 +- rpc/service.py | 180 ++++++++++++++++++--------- simulator.py | 8 +- 3 files changed, 126 insertions(+), 64 deletions(-) diff --git a/rpc/protos/rpc/erdos_scheduler.proto b/rpc/protos/rpc/erdos_scheduler.proto index 0767be83..e49ec8c4 100644 --- a/rpc/protos/rpc/erdos_scheduler.proto +++ b/rpc/protos/rpc/erdos_scheduler.proto @@ -193,11 +193,11 @@ message Placement { string application_id = 2; uint32 task_id = 3; uint32 cores = 4; - bool cancelled = 5; // If the task (and thereby the task graph) should be cancelled } message GetPlacementsResponse { bool success = 1; repeated Placement placements = 2; string message = 3; + bool terminate = 4; // terminate the task graph } diff --git a/rpc/service.py b/rpc/service.py index 68bf8fec..6ab9b143 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -348,12 +348,11 @@ async def DeregisterDriver(self, request, context): task_graph_name = self._registered_app_drivers[request.id] del self._registered_app_drivers[request.id] - # Log stats - log_stats_event = Event( - event_type=EventType.LOG_STATS, - time=stime, - ) with self._lock: + log_stats_event = Event( + event_type=EventType.LOG_STATS, + time=self.__stime(), + ) self._simulator._event_queue.add_event(log_stats_event) msg = f"[{stime}] Successfully de-registered driver with id {request.id} for task graph {task_graph_name}" @@ -469,27 +468,28 @@ async def RegisterEnvironmentReady(self, request, context): ) r = self._registered_applications[request.id] - - # Generate the task graph now r.generate_task_graph(stime) - self._workload_loader.add_task_graph(r.task_graph) - - update_workload_event = Event( - event_type=EventType.UPDATE_WORKLOAD, - time=stime, - ) - scheduler_start_event = Event( - event_type=EventType.SCHEDULER_START, - 
time=stime.to(EventTime.Unit.US), - ) - with self._lock: - self._simulator._event_queue.add_event(update_workload_event) - self._simulator._event_queue.add_event(scheduler_start_event) - self._logger.info( - f"[{stime}] Added event {update_workload_event} to the simulator's event queue" + self._simulator._workload.add_task_graph(r.task_graph) + self._simulator._current_task_graph_placements[r.task_graph.name] = {} + + for task in r.task_graph.get_releasable_tasks(): + task_release_event = Event( + event_type=EventType.TASK_RELEASE, + time=self.__stime(), + task=task, + ) + self._logger.info( + f"[{stime}] Added event {task_release_event} to the simulator's event queue", + ) + self._simulator._event_queue.add_event(task_release_event) + + scheduler_start_event = Event( + event_type=EventType.SCHEDULER_START, + time=self.__stime(), ) + self._simulator._event_queue.add_event(scheduler_start_event) self._logger.info( f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue" ) @@ -563,30 +563,98 @@ async def GetPlacements(self, request, context): placements=[], ) - # Check if the task graph is active if r.task_graph.is_complete(): msg = f"[{stime}] Task graph '{r.task_graph.name}' is complete. No more placements to provide." self._logger.error(msg) return erdos_scheduler_pb2.GetPlacementsResponse( - success=False, + success=True, message=msg, ) + # A task graph is considered complete if **all** of its **sink** tasks + # are complete. It is considered cancelled if **any** of its **sink** + # tasks are cancelled. + + # If the task graph is complete, the Spark application will + # automatically shut down because it knows that all of its stages have + # finished executing. + + # Matters get interesting in the presence of task cancellations. The + # service is aware of which tasks are cancelled. 
+ + # First, even when a task graph is cancelled, the simulator (without + # orchestration) + # continues to schedule and execute any tasks that were released into + # the system. The service, which runs the simulator in orchestrated + # mode, must emulate this behavior to maintain parity. + + # Second, from Spark's point of view, however, those tasks are still + # pending placements. So, Spark will continue to periodically invoke + # `GetPlacements` in the hopes of receiving placements for those + # cancelled tasks. Left unhandled, the Spark application will loop + # indefinitely waiting for placements. + + # We _could_ communicate these task cancellations to Spark. Then, we + # can modify the DAGScheduler to invoke GetPlacements until all of its + # stages have either finished executing or have been cancelled, after + # which it can safely terminate the application. + + # However, we run into an issue due to VIRTUAL tasks. When a task is + # cancelled, the simulator invokes `TaskGraph.cancel(task)`. + # `TaskGraph.cancel(task)` traverses the tree rooted at `task` + # depth-first, cancelling tasks along the way until it finds the first + # terminal task. As a consequence, it is possible for the tree rooted + # at a cancelled task to have VIRTUAL tasks inside of it. These + # virtual tasks will never receive placements because they are not + # releasable. So, it is possible for the Spark application to stall on + # `GetPlacements` waiting on placements for these virtual tasks. + + # Since the service knows the state of each task, it is easy then for + # the service to determine when the Spark application should terminate + # in the presence of task cancellations. + + # So, instead of communicating task cancellations, we communicate when + # the Spark application should terminate. + # + # The first check makes sure all tasks are either CANCELLED, + # COMPLETED, or VIRTUAL. 
We check for all tasks because it is possible + # that the simulator is processing released and scheduled tasks. If we + # terminate early, then we will never receive `NotifyTaskCompletion`s + # for those tasks (because the Spark application was terminated), + # which then results in those tasks never getting removed from the + # worker pool. + # + # The second check makes sure that the task graph is indeed cancelled. + # We have this additional guard because at the start all tasks are + # VIRTUAL and we don't want to terminate the application then. + + should_terminate = all( + task.state + in ( + TaskState.CANCELLED, + TaskState.COMPLETED, + TaskState.VIRTUAL, + ) + for task in r.task_graph + ) and (r.task_graph.is_cancelled()) + if should_terminate: + msg = f"[{stime}] Task graph '{r.task_graph.name}' was cancelled. No more placements to provide." + self._logger.error(msg) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=True, + message=msg, + terminate=True, + ) + with self._lock: sim_placements = self._simulator.get_current_placements_for_task_graph( r.task_graph.name ) - self._logger.info( - f"Received the following placements for '{r.task_graph.name}': {sim_placements}" - ) - - # Construct response. 
Notably, we apply stage-id mapping placements = [] for placement in sim_placements: - # Ignore virtual placements - if placement.task.state < TaskState.RELEASED: - self._logger.debug("[{stime}] Skipping placement: {placement}") + if placement.task.state != TaskState.RUNNING: + self._logger.debug(f"[{stime}] Skipping placement: {placement}") continue worker_id = ( @@ -601,23 +669,17 @@ async def GetPlacements(self, request, context): else 0 ) - if placement.placement_type not in ( - Placement.PlacementType.PLACE_TASK, - Placement.PlacementType.CANCEL_TASK, - ): + if placement.placement_type not in (Placement.PlacementType.PLACE_TASK,): raise NotImplementedError placements.append( { "worker_id": worker_id, "application_id": request.id, - "task_id": int(task_id), + "task_id": task_id, "cores": cores, - "cancelled": placement.placement_type - == Placement.PlacementType.CANCEL_TASK, - } + }, ) - self._logger.info(f"Sending placements for '{r.task_graph.name}': {placements}") return erdos_scheduler_pb2.GetPlacementsResponse( success=True, @@ -660,28 +722,28 @@ async def NotifyTaskCompletion(self, request, context): task.start_time + task.slowest_execution_strategy.runtime ) - # NOTE: Although the actual_task_completion_time works for task completion notifications that arrive early, it is - # inaccurate for task completion notifications that occur past that time. Thus, a max of the current and actual completion time - # is taken to ensure that the task is marked completed at the correct time. 
- task_finished_event = Event( - event_type=EventType.TASK_FINISHED, - time=max(actual_task_completion_time, stime), - task=task, - ) - scheduler_start_event = Event( - event_type=EventType.SCHEDULER_START, - time=max( - actual_task_completion_time.to(EventTime.Unit.US), - stime.to(EventTime.Unit.US), - ), - ) - with self._lock: + # NOTE: Although the actual_task_completion_time works for task completion notifications that arrive early, it is + # inaccurate for task completion notifications that occur past that time. Thus, a max of the current and actual completion time + # is taken to ensure that the task is marked completed at the correct time. + task_finished_event = Event( + event_type=EventType.TASK_FINISHED, + time=max(actual_task_completion_time, self.__stime()), + task=task, + ) self._simulator._event_queue.add_event(task_finished_event) - self._simulator._event_queue.add_event(scheduler_start_event) self._logger.info( f"[{stime}] Adding event {task_finished_event} to the simulator's event queue" ) + + scheduler_start_event = Event( + event_type=EventType.SCHEDULER_START, + time=max( + actual_task_completion_time.to(EventTime.Unit.US), + self.__stime(), + ), + ) + self._simulator._event_queue.add_event(scheduler_start_event) self._logger.info( f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue" ) @@ -698,7 +760,7 @@ async def _tick_simulator(self): with self._lock: if self._simulator is not None: stime = self.__stime() - self._logger.debug(f"[{stime}] Simulator tick") + # self._logger.debug(f"[{stime}] Simulator tick") self._simulator.tick(until=stime) # else: # print("Simulator instance is None") diff --git a/simulator.py b/simulator.py index 3a937ddf..446ecc52 100644 --- a/simulator.py +++ b/simulator.py @@ -525,8 +525,6 @@ def tick(self, until: EventTime) -> None: """Tick the simulator until the specified time""" def f(): - self._logger.debug(f"EQ: {self._event_queue}") - time_until_next_event = self.__time_until_next_event() 
if ( @@ -1224,6 +1222,7 @@ def __handle_task_finished(self, event: Event) -> None: task_placed_at_worker_pool = self._worker_pools.get_worker_pool( event.task.worker_pool_id ) + task_placed_at_worker_pool.remove_task(current_time=event.time, task=event.task) # Remove the task from it's task graph's current placements @@ -1626,7 +1625,9 @@ def is_source_task(task): task_graph = self._workload.get_task_graph(task.task_graph) return task_graph.is_source_task(task) - releasable_tasks = [task for task in releasable_tasks if is_source_task(task)] + releasable_tasks = [ + task for task in releasable_tasks if is_source_task(task) + ] self._logger.info( "[%s] The WorkloadLoader %s has %s TaskGraphs that released %s tasks.", @@ -1676,7 +1677,6 @@ def is_source_task(task): max_release_time = self._simulator_time for task in releasable_tasks: - event = Event( event_type=EventType.TASK_RELEASE, time=task.release_time, task=task ) From 4d979c1cfb1801a990c400c5c05003b9a419dd98 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Fri, 29 Nov 2024 18:09:22 -0500 Subject: [PATCH 090/128] simulator bug fixes and improvements - update placement time of pushed placement - dedup scheduler events with the same timestamp --- simulator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/simulator.py b/simulator.py index 446ecc52..d80335fe 100644 --- a/simulator.py +++ b/simulator.py @@ -341,7 +341,7 @@ def event_representation_filter(record): self.__log_utilization(self._simulator_time) # Internal data. - self._last_scheduler_start_time = self._simulator_time + self._last_scheduler_start_time = EventTime.invalid() self._next_scheduler_event = None self._last_scheduler_placements: Optional[Placements] = None @@ -582,6 +582,10 @@ def __handle_scheduler_start(self, event: Event) -> None: Args: event (`Event`): The event to handle. """ + + if self._last_scheduler_start_time == event.time: + return + # Log the required CSV information. 
currently_placed_tasks = self._worker_pools.get_placed_tasks() schedulable_tasks = self._workload.get_schedulable_tasks( @@ -1430,6 +1434,7 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: task=event.task, placement=event.placement, ) + event.placement._placement_time = next_placement_time self._future_placement_events[task.id] = next_placement_event self._event_queue.add_event(next_placement_event) self._logger.info( From 48a9711ae78d78ce7d6cbc4918c1936671fa122b Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 2 Dec 2024 17:15:11 -0500 Subject: [PATCH 091/128] add launch script --- rpc/launch_tpch_queries.py | 228 +++++++++++++++++++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 rpc/launch_tpch_queries.py diff --git a/rpc/launch_tpch_queries.py b/rpc/launch_tpch_queries.py new file mode 100644 index 00000000..09beae64 --- /dev/null +++ b/rpc/launch_tpch_queries.py @@ -0,0 +1,228 @@ +import argparse +import os +import random +import subprocess +import sys +import time +import numpy as np + +from pathlib import Path + +from workload import JobGraph +from utils import EventTime +from data.tpch_loader import make_release_policy + + +def map_dataset_to_deadline(dataset_size): + # 50gb => 2mins, 100gb => 6mins, 250gb => 12mins, 500gb => 24mins + mapping = {"50": 120, "100": 360, "250": 720, "500": 1440} + return mapping.get(dataset_size, 120) # Default to 120s if dataset size is NA + + +def launch_query(query_number, args): + deadline = map_dataset_to_deadline(args.dataset_size) + + cmd = [ + f"{args.spark_mirror_path.resolve()}/bin/spark-submit", + *("--deploy-mode", "cluster"), + *("--master", "spark://130.207.125.81:7077"), + *("--conf", "'spark.port.maxRetries=132'"), + *("--conf", "'spark.eventLog.enabled=true'"), + *("--conf", f"'spark.eventLog.dir={args.spark_eventlog_dir.resolve()}'"), + *("--conf", "'spark.sql.adaptive.enabled=false'"), + *("--conf", 
"'spark.sql.adaptive.coalescePartitions.enabled=false'"), + *("--conf", "'spark.sql.autoBroadcastJoinThreshold=-1'"), + *("--conf", "'spark.sql.shuffle.partitions=1'"), + *("--conf", "'spark.sql.files.minPartitionNum=1'"), + *("--conf", "'spark.sql.files.maxPartitionNum=1'"), + *("--conf", f"'spark.app.deadline={deadline}'"), + *("--class", "'main.scala.TpchQuery'"), + f"{args.tpch_spark_path.resolve()}/target/scala-2.13/spark-tpc-h-queries_2.13-1.0.jar", + f"{query_number}", + f"{args.dataset_size}", + f"{args.max_cores}", + ] + + # print( + # f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Launching Query: {query_number}, " + # f"dataset: {args.dataset_size}GB, deadline: {deadline}s, maxCores: {args.max_cores}" + # ) + + try: + cmd = ' '.join(cmd) + print("Launching:", cmd) + subprocess.Popen( + cmd, + shell=True, + ) + print("Query launched successfully.") + except Exception as e: + print(f"Error launching query: {e}") + + +def generate_release_times(rng, args): + if args.distribution == "periodic": + release_policy_args = { + "period": EventTime(args.period, EventTime.Unit.US), + } + elif args.distribution == "fixed": + release_policy_args = { + "period": EventTime(args.period, EventTime.Unit.US), + "num_invocations": args.num_queries, + } + elif args.distribution == "poisson": + release_policy_args = { + "rate": args.variable_arrival_rate, + "num_invocations": args.num_queries, + } + elif args.distribution == "gamma": + release_policy_args = { + "rate": args.variable_arrival_rate, + "num_invocations": args.num_queries, + "coefficient": args.coefficient, + } + elif args.distribution == "fixed_gamma": + release_policy_args = { + "variable_arrival_rate": args.variable_arrival_rate, + "base_arrival_rate": args.base_arrival_rate, + "num_invocations": args.num_queries, + "coefficient": args.coefficient, + } + else: + raise NotImplementedError( + f"Release policy {args.distribution} not implemented." 
+ ) + + release_policy = make_release_policy( + args.distribution, + release_policy_args, + rng, + args.rng_seed, + (args.randomize_start_time_min, args.randomize_start_time_max), + ) + + release_times = release_policy.get_release_times( + completion_time=EventTime(sys.maxsize, EventTime.Unit.US) + ) + + return release_times + + +def main(): + parser = argparse.ArgumentParser( + description="Generate a workload of queries based on distribution type." + ) + parser.add_argument( + "--spark-mirror-path", + type=Path, + required=True, + help="Path to spark-mirror repository", + ) + parser.add_argument( + "--tpch-spark-path", + type=Path, + required=True, + help="Path to TPC-H Spark repository", + ) + parser.add_argument( + "--spark-eventlog-dir", + default=Path(os.getcwd()) / "spark-eventlog", + type=Path, + help="Path to directory in which to Spark event logs will be dumped", + ) + parser.add_argument( + "--distribution", + choices=["periodic", "fixed", "poisson", "gamma", "closed_loop", "fixed_gamma"], + default="gamma", + help="Type of distribution for query inter-arrival times (default: gamma)", + ) + parser.add_argument( + "--num_queries", + type=int, + default=50, + help="Number of queries to generate (default: 50)", + ) + parser.add_argument( + "--dataset_size", + choices=["50", "100", "250", "500"], + default="50", + help="Dataset size per query in GB (default: 50)", + ) + parser.add_argument( + "--max_cores", + type=int, + choices=[50, 75, 100, 200], + default=50, + help="Maximum executor cores (default: 50)", + ) + parser.add_argument( + "--period", + type=int, + default=25, + help="Releases a DAG after period time has elapsed", + ) + parser.add_argument( + "--variable_arrival_rate", + type=float, + default=1.0, + help="Variable arrival rate for poisson and gamma distributions", + ) + parser.add_argument( + "--coefficient", + type=float, + default=1.0, + help="Coefficient for poisson and gamma distributions", + ) + parser.add_argument( + 
"--base_arrival_rate", + type=float, + default=1.0, + help="Base arrival rate for fixed_gamma distribution", + ) + parser.add_argument("--randomize_start_time_min", type=int, default=0) + parser.add_argument("--randomize_start_time_max", type=int, default=0) + parser.add_argument( + "--rng_seed", + type=int, + default=1234, + help="RNG seed for generating inter-arrival periods and picking DAGs (default: 1234)", + ) + parser.add_argument("--queries", type=int, nargs='+', help="Launch specific queries") + + args = parser.parse_args() + + if not args.spark_eventlog_dir.exists(): + args.spark_eventlog_dir.mkdir(parents=True) + + os.environ["TPCH_INPUT_DATA_DIR"] = str(args.tpch_spark_path.resolve() / "dbgen") + + if args.queries: + assert(len(args.queries) == args.num_queries) + + rng = random.Random(args.rng_seed) + + # Generate release times + release_times = generate_release_times(rng, args) + print("Release times:", release_times) + + # Launch queries + inter_arrival_times = [release_times[0].time] + for i in range(len(release_times) - 1): + inter_arrival_times.append(release_times[i + 1].time - release_times[i].time) + for i, inter_arrival_time in enumerate(inter_arrival_times): + time.sleep(inter_arrival_time) + if args.queries: + query_number = args.queries[i] + else: + query_number = rng.randint(1, 22) + launch_query(query_number, args) + print( + "Current time: ", + time.strftime("%Y-%m-%d %H:%M:%S"), + " launching query: ", + query_number, + ) + + +if __name__ == "__main__": + main() From b1942ea491ea33d0810e9aa8c6536d97add8c39e Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 2 Dec 2024 22:59:36 -0500 Subject: [PATCH 092/128] add shutdown rpc method --- rpc/launch_tpch_queries.py | 26 +++++++++++++++++++++----- rpc/protos/rpc/erdos_scheduler.proto | 4 ++++ rpc/service.py | 15 +++++++++++++-- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/rpc/launch_tpch_queries.py b/rpc/launch_tpch_queries.py index 09beae64..73fd3a44 100644 --- 
a/rpc/launch_tpch_queries.py +++ b/rpc/launch_tpch_queries.py @@ -11,6 +11,10 @@ from workload import JobGraph from utils import EventTime from data.tpch_loader import make_release_policy +from rpc import erdos_scheduler_pb2 +from rpc import erdos_scheduler_pb2_grpc + +import grpc def map_dataset_to_deadline(dataset_size): @@ -49,13 +53,14 @@ def launch_query(query_number, args): # ) try: - cmd = ' '.join(cmd) + cmd = " ".join(cmd) print("Launching:", cmd) - subprocess.Popen( + p = subprocess.Popen( cmd, shell=True, ) print("Query launched successfully.") + return p except Exception as e: print(f"Error launching query: {e}") @@ -187,7 +192,9 @@ def main(): default=1234, help="RNG seed for generating inter-arrival periods and picking DAGs (default: 1234)", ) - parser.add_argument("--queries", type=int, nargs='+', help="Launch specific queries") + parser.add_argument( + "--queries", type=int, nargs="+", help="Launch specific queries" + ) args = parser.parse_args() @@ -197,7 +204,7 @@ def main(): os.environ["TPCH_INPUT_DATA_DIR"] = str(args.tpch_spark_path.resolve() / "dbgen") if args.queries: - assert(len(args.queries) == args.num_queries) + assert len(queries) == args.num_queries rng = random.Random(args.rng_seed) @@ -206,6 +213,7 @@ def main(): print("Release times:", release_times) # Launch queries + ps = [] inter_arrival_times = [release_times[0].time] for i in range(len(release_times) - 1): inter_arrival_times.append(release_times[i + 1].time - release_times[i].time) @@ -215,7 +223,7 @@ def main(): query_number = args.queries[i] else: query_number = rng.randint(1, 22) - launch_query(query_number, args) + ps.append(launch_query(query_number, args)) print( "Current time: ", time.strftime("%Y-%m-%d %H:%M:%S"), @@ -223,6 +231,14 @@ def main(): query_number, ) + for p in ps: + p.wait() + + channel = grpc.insecure_channel("localhost:50051") + stub = erdos_scheduler_pb2_grpc.SchedulerServiceStub(channel) + response = stub.Shutdown(erdos_scheduler_pb2.Empty()) + 
channel.close() + if __name__ == "__main__": main() diff --git a/rpc/protos/rpc/erdos_scheduler.proto b/rpc/protos/rpc/erdos_scheduler.proto index e49ec8c4..262254da 100644 --- a/rpc/protos/rpc/erdos_scheduler.proto +++ b/rpc/protos/rpc/erdos_scheduler.proto @@ -47,6 +47,8 @@ service SchedulerService { /// Notifies the Scheduler that a Task from a particular TaskGraph has completed.option rpc NotifyTaskCompletion(NotifyTaskCompletionRequest) returns (NotifyTaskCompletionResponse) {} + + rpc Shutdown(Empty) returns (Empty) {} } @@ -201,3 +203,5 @@ message GetPlacementsResponse { string message = 3; bool terminate = 4; // terminate the task graph } + +message Empty {} diff --git a/rpc/service.py b/rpc/service.py index 6ab9b143..6ebe5d48 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -136,7 +136,9 @@ def canonical_task_id(self, stage_id: int): class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): - def __init__(self) -> None: + def __init__(self, server) -> None: + self._server = server + # Override some flags # Enable orchestrated mode @@ -230,6 +232,7 @@ def __init__(self) -> None: self._registered_app_drivers = ( {} ) # Spark driver id differs from taskgraph name (application id) + self._shutdown = False self._lock = threading.Lock() super().__init__() @@ -357,6 +360,10 @@ async def DeregisterDriver(self, request, context): msg = f"[{stime}] Successfully de-registered driver with id {request.id} for task graph {task_graph_name}" self._logger.info(msg) + + if len(self._registered_app_drivers) == 0 and self._shutdown: + await self._server.stop(0) + return erdos_scheduler_pb2.DeregisterDriverResponse( success=True, message=msg, @@ -755,6 +762,10 @@ async def NotifyTaskCompletion(self, request, context): message=msg, ) + async def Shutdown(self, request, context): + self._shutdown = True + return erdos_scheduler_pb2.Empty() + async def _tick_simulator(self): while True: with self._lock: @@ -819,7 +830,7 @@ def main(_argv): loop = 
asyncio.get_event_loop() server = grpc.aio.server(futures.ThreadPoolExecutor(max_workers=FLAGS.max_workers)) - servicer = Servicer() + servicer = Servicer(server) erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server(servicer, server) server.add_insecure_port(f"[::]:{FLAGS.port}") From 2e70864c682001d686b39141492589ee82e2ef20 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 2 Dec 2024 23:00:24 -0500 Subject: [PATCH 093/128] add service experiment runner --- scripts/run_service_experiments.py | 172 +++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 scripts/run_service_experiments.py diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py new file mode 100644 index 00000000..d0cce5b6 --- /dev/null +++ b/scripts/run_service_experiments.py @@ -0,0 +1,172 @@ +import argparse +import subprocess +import time +import traceback +from pathlib import Path +from dataclasses import dataclass + +SPARK_MIRROR_PATH = str(Path("../spark_mirror").resolve()) +TPCH_SPARK_PATH = str(Path("../tpch-spark").resolve()) + + +def bang(cmd, dry_run): + cmd = [str(part) for part in cmd] + print(" ".join(cmd)) + if dry_run: + return + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return p + + +def must(cmd, dry_run): + p = bang(cmd, dry_run) + if p.wait() != 0: + stdout, stderr = p.communicate() + raise Exception(f"Command failed. stdout: {stdout}. 
stderr: {stderr}.") + return p + + +@dataclass +class Service: + service_args: any + output_dir: Path + dry_run: bool + + _service = None + _master = None + _worker = None + + def __enter__(self): + log_file = self.output_dir / "service.log" + csv_file = self.output_dir / "service.csv" + + # launch service + self._service = bang( + [ + *("python3", "-m", "rpc.service"), + *("--log", log_file), + *("--csv_file_name", csv_file), + *self.service_args, + ], + self.dry_run, + ) + + # sleep for some time + if not self.dry_run: + time.sleep(3) + + try: + # launch spark master and worker + self._master = must( + [ + f"{SPARK_MIRROR_PATH}/sbin/start-master.sh", + *("--host", "130.207.125.81"), + *( + "--properties-file", + f"{SPARK_MIRROR_PATH}/conf/spark-dg-config.conf", + ), + ], + self.dry_run, + ) + self._worker = must( + [ + f"{SPARK_MIRROR_PATH}/sbin/start-worker.sh", + "spark://130.207.125.81:7077", + *( + "--properties-file", + f"{SPARK_MIRROR_PATH}/conf/spark-dg-config.conf", + ), + ], + self.dry_run, + ) + except Exception as e: + self.clean() + raise e + + def clean(self): + if self._service: + self._service.wait() + if self._master: + must([f"{SPARK_MIRROR_PATH}/sbin/stop-master.sh"], self.dry_run) + if self._worker: + must([f"{SPARK_MIRROR_PATH}/sbin/stop-worker.sh"], self.dry_run) + + def __exit__(self, type, value, traceback): + self.clean() + + +@dataclass +class Launcher: + launcher_args: any + dry_run: bool + + def launch(self): + must( + [ + *("python3", "-m", "rpc.launch_tpch_queries"), + *self.launcher_args, + *("--spark-mirror-path", SPARK_MIRROR_PATH), + *("--tpch-spark-path", TPCH_SPARK_PATH), + ], + self.dry_run, + ) + + +@dataclass +class Experiment: + name: str + service_args: any + launcher_args: any + args: any + + def run(self): + output_dir = self.args.output_dir / self.name + if not output_dir.exists(): + output_dir.mkdir(parents=True) + + with Service( + service_args=self.service_args, + output_dir=output_dir, + dry_run=self.args.dry_run, + 
) as s: + Launcher(self.launcher_args, self.args.dry_run).launch() + time.sleep(10) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--dry-run", + action="store_true", + help="Prints commands that will be executed for each experiment", + ) + parser.add_argument("--output-dir", type=Path, default=Path("exp-output")) + args = parser.parse_args() + + if not args.output_dir.exists(): + args.output_dir.mkdir(parents=True) + + experiments = [ + Experiment( + name="testing", + service_args=[ + "--enforce_deadlines", + "--override_worker_cpu_count", + ], + launcher_args=["--num_queries", 1], + args=args, + ) + ] + + for i, experiment in enumerate(experiments): + try: + print(f"=== {experiment.name} ({i+1}/{len(experiments)}) ===") + experiment.run() + print("=== done ===") + except Exception as e: + print(traceback.format_exc()) + print(f"Failed to run experiment '{experiment}'. Exception: '{e}'") + + +if __name__ == "__main__": + main() From aeb4d6d97387c0cec24bbc6288b47425eb42b21b Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 2 Dec 2024 23:04:25 -0500 Subject: [PATCH 094/128] sleep for a bit before launching queries --- scripts/run_service_experiments.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py index d0cce5b6..241d2884 100644 --- a/scripts/run_service_experiments.py +++ b/scripts/run_service_experiments.py @@ -83,6 +83,9 @@ def __enter__(self): self.clean() raise e + if not self.dry_run: + time.sleep(5) + def clean(self): if self._service: self._service.wait() From 4094e018e8715f85ebc8f3c659fe48e133446245 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Mon, 2 Dec 2024 23:08:30 -0500 Subject: [PATCH 095/128] remove extraneous sleep --- scripts/run_service_experiments.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py index 241d2884..a3318f1e 100644 --- 
a/scripts/run_service_experiments.py +++ b/scripts/run_service_experiments.py @@ -133,7 +133,6 @@ def run(self): dry_run=self.args.dry_run, ) as s: Launcher(self.launcher_args, self.args.dry_run).launch() - time.sleep(10) def main(): From 167ba4948336f9c72c346e71b878c2401147552c Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Mon, 2 Dec 2024 23:45:57 -0500 Subject: [PATCH 096/128] set start_time of task to its placement time --- workload/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/workload/tasks.py b/workload/tasks.py index aae07301..c929f696 100644 --- a/workload/tasks.py +++ b/workload/tasks.py @@ -224,6 +224,7 @@ def schedule( self._state = TaskState.SCHEDULED self._scheduling_time = time self._scheduler_placement = placement + self._start_time = placement.placement_time self._worker_pool_id = placement.worker_pool_id self.update_remaining_time(placement.execution_strategy.runtime) From 53cc7359c9bd89a6e266b5cf046050c162d05446 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Mon, 2 Dec 2024 23:47:10 -0500 Subject: [PATCH 097/128] add testcase to verify new taskgraph termination approach --- tests/test_service.py | 48 +++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/tests/test_service.py b/tests/test_service.py index 1b92030f..9f623445 100644 --- a/tests/test_service.py +++ b/tests/test_service.py @@ -267,35 +267,53 @@ def test_service(): # Wait for 2 seconds to allow scheduler to process task completion and run scheduler time.sleep(2) - # Get placements for the task, entire taskgraph should be cancelled since deadline missed - # Other root vertex (0) will be cancelled first. Then the subsequent vertices. - # NOTE: The simulator will return all current placements for a taskgraph (including - # those already sent by the service) until the task is marked as finished. Spark will ignore it. 
- # In this scenario of task-graph-2, placements has two values- Task 0 in cancelled state and - # Task 1 in running state. The service will return both of them. + # Get placements for the task, entire taskgraph would be cancelled since deadline has passed. + # Since one root vertex (1) is running, the other root vertex (0) will be cancelled first, + # then the subsequent vertices. + # NOTE: The service will wait until all running/ scheduled tasks complete and are removed + # from the workerpool before issuing a terminate=True for the taskgraph. Until then it will + # return current placements for a taskgraph (including those already sent by the service) + # and wait for running tasks to finish. Spark will ignore it. request = erdos_scheduler_pb2.GetPlacementsRequest( timestamp=1234567890, id="task-graph-2", ) response = stub.GetPlacements(request) - print(response) assert response.success actual_task_ids = set() + # Will return placement for task_id 1 for placement in response.placements: - if placement.task_id == 0: - assert ( - placement.worker_id == "None" - and placement.application_id == "task-graph-2" - and placement.cancelled == True - ) if placement.task_id == 1: assert ( placement.worker_id == "1234" and placement.application_id == "task-graph-2" - and placement.cancelled == False ) actual_task_ids.add(placement.task_id) - assert actual_task_ids == {0, 1} + assert actual_task_ids == {1} + + # Wait for 5s to issue notify task completion for task_id 1 in task-graph-2 + time.sleep(5) + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( + application_id="task-graph-2", task_id=1, timestamp=1234567890 + ) + response = stub.NotifyTaskCompletion(request) + assert response.success + + # Wait for 5s to allow the simulator to process the event. 
+ # Invoke get placements again for task-graph 2, it should return terminate=True now + time.sleep(5) + request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-2", + ) + response = stub.GetPlacements(request) + assert response.success + actual_task_ids = set() + # Will return placement for task_id 1 + for placement in response.placements: + actual_task_ids.add(placement.task_id) + assert len(actual_task_ids) == 0 + assert response.terminate == True # Deregister framework request = erdos_scheduler_pb2.DeregisterFrameworkRequest( From 29e306f9892c36e8bef9551ad3babb9b66f55ccc Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Mon, 2 Dec 2024 23:47:41 -0500 Subject: [PATCH 098/128] nit documentation in erdos-spark setup --- rpc/spark_erdos_setup.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rpc/spark_erdos_setup.md b/rpc/spark_erdos_setup.md index 005ed44b..fe8f3678 100644 --- a/rpc/spark_erdos_setup.md +++ b/rpc/spark_erdos_setup.md @@ -52,6 +52,9 @@ Clone the repository with submodules git clone https://github.com/dhruvsgarg/spark_mirror.git --recursive ``` +> NOTE: If the submodule was cloned earlier but has been updated since, `git fetch --all` will not be able to track those changes. To pull in updates +> from submodule's parent, run `git submodule update --init --recursive`. + ### Verify branch Verify or set current branch `erdos-spark-integration` From 14190ad0548aa725940de6ed8963004c69d22c76 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Mon, 2 Dec 2024 23:48:43 -0500 Subject: [PATCH 099/128] more explainable msgs from GetPlacements --- rpc/service.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 6ebe5d48..11040e41 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -634,6 +634,9 @@ async def GetPlacements(self, request, context): # The second check makes sure that the task graph is indeed cancelled. 
# We have this additional guard because at the start all tasks are # VIRTUAL and we don't want to terminate the application then. + + if r.task_graph.is_cancelled(): + self._logger.error(f"[{stime}] Task graph '{r.task_graph.name}' is in state cancelled.") should_terminate = all( task.state @@ -645,13 +648,19 @@ async def GetPlacements(self, request, context): for task in r.task_graph ) and (r.task_graph.is_cancelled()) if should_terminate: - msg = f"[{stime}] Task graph '{r.task_graph.name}' was cancelled. No more placements to provide." + msg = f"[{stime}] Task graph '{r.task_graph.name}' was cancelled and simulator has processed all released/ scheduled tasks. Terminating it since it has no more placements to provide." self._logger.error(msg) return erdos_scheduler_pb2.GetPlacementsResponse( success=True, message=msg, terminate=True, ) + elif r.task_graph.is_cancelled() and not should_terminate: + msg = f"[{stime}] Task graph '{r.task_graph.name}' was cancelled but simulator is still processing some released/ scheduled tasks. Will provide placements." + self._logger.error(msg) + else: + msg = f"[{stime}] Task graph '{r.task_graph.name}' is actively running. Will provide placements." + self._logger.info(msg) with self._lock: sim_placements = self._simulator.get_current_placements_for_task_graph( @@ -687,10 +696,12 @@ async def GetPlacements(self, request, context): "cores": cores, }, ) - + + msg = f"[{stime}] Returning the following placements {placements} for task graph '{request.id}'." 
+ self._logger.info(msg) return erdos_scheduler_pb2.GetPlacementsResponse( success=True, - message=f"Placements for task graph '{request.id}' returned successfully", + message=msg, placements=placements, ) From e2636f6facd11161dc16ba3fff24a1bbc5e9b9ab Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Mon, 2 Dec 2024 23:49:46 -0500 Subject: [PATCH 100/128] reorder event queue priority to process scheduler events before task placement --- simulator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/simulator.py b/simulator.py index d80335fe..882526e2 100644 --- a/simulator.py +++ b/simulator.py @@ -38,9 +38,9 @@ class EventType(Enum): TASK_PREEMPT = 7 # Ask the simulator to preempt a task. TASK_MIGRATION = 8 # Ask the simulator to migrate a task. LOAD_PROFILE = 9 # Ask the simulator to load a profile into the WorkerPool. - TASK_PLACEMENT = 10 # Ask the simulator to place a task. - SCHEDULER_START = 11 # Requires the simulator to invoke the scheduler. - SCHEDULER_FINISHED = 12 # Signifies the end of the scheduler loop. + SCHEDULER_START = 10 # Requires the simulator to invoke the scheduler. + SCHEDULER_FINISHED = 11 # Signifies the end of the scheduler loop. + TASK_PLACEMENT = 12 # Ask the simulator to place a task. SIMULATOR_END = 13 # Signify the end of the simulator loop. LOG_UTILIZATION = 14 # Ask the simulator to log worker pool utilization. 
LOG_STATS = 15 # Log simulator statistics From eeb48544091f8da75bf365483334b04b960515d6 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 3 Dec 2024 10:22:25 -0500 Subject: [PATCH 101/128] improvements to experiment runner --- rpc/launch_tpch_queries.py | 2 + scripts/run_service_experiments.py | 89 +++++++++++++++++++++++------- 2 files changed, 72 insertions(+), 19 deletions(-) diff --git a/rpc/launch_tpch_queries.py b/rpc/launch_tpch_queries.py index 73fd3a44..47965d1b 100644 --- a/rpc/launch_tpch_queries.py +++ b/rpc/launch_tpch_queries.py @@ -225,6 +225,7 @@ def main(): query_number = rng.randint(1, 22) ps.append(launch_query(query_number, args)) print( + f"({i+1}/{len(release_times)})", "Current time: ", time.strftime("%Y-%m-%d %H:%M:%S"), " launching query: ", @@ -238,6 +239,7 @@ def main(): stub = erdos_scheduler_pb2_grpc.SchedulerServiceStub(channel) response = stub.Shutdown(erdos_scheduler_pb2.Empty()) channel.close() + print("Sent shutdown signal to the service") if __name__ == "__main__": diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py index a3318f1e..92bde625 100644 --- a/scripts/run_service_experiments.py +++ b/scripts/run_service_experiments.py @@ -9,17 +9,17 @@ TPCH_SPARK_PATH = str(Path("../tpch-spark").resolve()) -def bang(cmd, dry_run): +def bang(cmd, dry_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE): cmd = [str(part) for part in cmd] print(" ".join(cmd)) if dry_run: return - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen(cmd, stdout=stdout, stderr=stderr) return p -def must(cmd, dry_run): - p = bang(cmd, dry_run) +def must(cmd, dry_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE): + p = bang(cmd, dry_run, stdout, stderr) if p.wait() != 0: stdout, stderr = p.communicate() raise Exception(f"Command failed. stdout: {stdout}. 
stderr: {stderr}.") @@ -101,18 +101,25 @@ def __exit__(self, type, value, traceback): @dataclass class Launcher: launcher_args: any + output_dir: Path dry_run: bool def launch(self): - must( - [ - *("python3", "-m", "rpc.launch_tpch_queries"), - *self.launcher_args, - *("--spark-mirror-path", SPARK_MIRROR_PATH), - *("--tpch-spark-path", TPCH_SPARK_PATH), - ], - self.dry_run, - ) + with ( + open(self.output_dir / "launcher.stdout", "w") as f_out, + open(self.output_dir / "launcher.stderr", "w") as f_err, + ): + must( + [ + *("python3", "-u", "-m", "rpc.launch_tpch_queries"), + *self.launcher_args, + *("--spark-mirror-path", SPARK_MIRROR_PATH), + *("--tpch-spark-path", TPCH_SPARK_PATH), + ], + self.dry_run, + stdout=f_out, + stderr=f_err, + ) @dataclass @@ -132,7 +139,7 @@ def run(self): output_dir=output_dir, dry_run=self.args.dry_run, ) as s: - Launcher(self.launcher_args, self.args.dry_run).launch() + Launcher(self.launcher_args, output_dir, self.args.dry_run).launch() def main(): @@ -148,16 +155,60 @@ def main(): if not args.output_dir.exists(): args.output_dir.mkdir(parents=True) + base_args = [ + "--enforce_deadlines", + "--override_worker_cpu_count", + ] + variance_args = [ + *("--min_deadline_variance", 10), + *("--max_deadline_variance", 25), + ] + edf_args = [ + *("--scheduler", "EDF"), + ] + dsched_args = [ + *("--scheduler", "TetriSched"), + "--release_taskgraphs", + *("--opt_passes", "CRITICAL_PATH_PASS"), + *("--opt_passes", "CAPACITY_CONSTRAINT_PURGE_PASS"), + *("--opt_passes", "DYNAMIC_DISCRETIZATION_PASS"), + "--retract_schedules", + *("--scheduler_max_occupancy_threshold", 0.999), + "--finer_discretization_at_prev_solution", + "--scheduler_selective_rescheduling", + *("--scheduler_reconsideration_period", 0.6), + *("--scheduler_time_discretization", 1), + *("--scheduler_max_time_discretization", 5), + *("--finer_discretization_window", 5), + *("--scheduler_plan_ahead_no_consideration_gap", 1), + ] experiments = [ Experiment( - name="testing", + 
name="edf-q300-hard", service_args=[ - "--enforce_deadlines", - "--override_worker_cpu_count", + *base_args, + *edf_args, + *variance_args, + ], + launcher_args=[ + *("--num_queries", 300), + *("--variable_arrival_rate", 0.052), ], - launcher_args=["--num_queries", 1], args=args, - ) + ), + Experiment( + name="dsched-q300-hard", + service_args=[ + *base_args, + *dsched_args, + *variance_args, + ], + launcher_args=[ + *("--num_queries", 300), + *("--variable_arrival_rate", 0.052), + ], + args=args, + ), ] for i, experiment in enumerate(experiments): From cb96c3e61a4eb31f1d51311ab9cc64cac57b102d Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 3 Dec 2024 14:44:40 -0500 Subject: [PATCH 102/128] fix profile path in tpch loader --- data/tpch_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index 91280ec6..97c2522a 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -420,10 +420,10 @@ def make_release_policy( # TODO: make configurable TPCH_SUBDIR = "100g/" DECIMA_TPCH_DIR = ( - "/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/decima/" + Path(__file__).resolve().parent / ".." / "profiles/workload/tpch/decima/" ) CLOUDLAB_TPCH_DIR = ( - "/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/cloudlab/" + Path(__file__).resolve().parent / ".." 
/ "profiles/workload/tpch/cloudlab/" ) From 7036fcf71484a59851034094fff7f2ae17d4aece Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Tue, 3 Dec 2024 14:45:06 -0500 Subject: [PATCH 103/128] add spark-master-ip flag --- rpc/launch_tpch_queries.py | 8 ++- scripts/run_service_experiments.py | 81 ++++++++++++++++++------------ 2 files changed, 55 insertions(+), 34 deletions(-) diff --git a/rpc/launch_tpch_queries.py b/rpc/launch_tpch_queries.py index 47965d1b..04bff0ad 100644 --- a/rpc/launch_tpch_queries.py +++ b/rpc/launch_tpch_queries.py @@ -29,7 +29,7 @@ def launch_query(query_number, args): cmd = [ f"{args.spark_mirror_path.resolve()}/bin/spark-submit", *("--deploy-mode", "cluster"), - *("--master", "spark://130.207.125.81:7077"), + *("--master", f"spark://{args.spark_master_ip}:7077"), *("--conf", "'spark.port.maxRetries=132'"), *("--conf", "'spark.eventLog.enabled=true'"), *("--conf", f"'spark.eventLog.dir={args.spark_eventlog_dir.resolve()}'"), @@ -123,6 +123,12 @@ def main(): required=True, help="Path to spark-mirror repository", ) + parser.add_argument( + "--spark-master-ip", + type=str, + required=True, + help="IP address of node running Spark master", + ) parser.add_argument( "--tpch-spark-path", type=Path, diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py index 92bde625..55e074d7 100644 --- a/scripts/run_service_experiments.py +++ b/scripts/run_service_experiments.py @@ -5,9 +5,6 @@ from pathlib import Path from dataclasses import dataclass -SPARK_MIRROR_PATH = str(Path("../spark_mirror").resolve()) -TPCH_SPARK_PATH = str(Path("../tpch-spark").resolve()) - def bang(cmd, dry_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE): cmd = [str(part) for part in cmd] @@ -29,6 +26,8 @@ def must(cmd, dry_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE): @dataclass class Service: service_args: any + spark_mirror_path: Path + spark_master_ip: str output_dir: Path dry_run: bool @@ -59,22 +58,22 @@ def __enter__(self): # 
launch spark master and worker self._master = must( [ - f"{SPARK_MIRROR_PATH}/sbin/start-master.sh", - *("--host", "130.207.125.81"), + f"{self.spark_mirror_path}/sbin/start-master.sh", + *("--host", self.spark_master_ip), *( "--properties-file", - f"{SPARK_MIRROR_PATH}/conf/spark-dg-config.conf", + f"{self.spark_mirror_path}/conf/spark-dg-config.conf", ), ], self.dry_run, ) self._worker = must( [ - f"{SPARK_MIRROR_PATH}/sbin/start-worker.sh", - "spark://130.207.125.81:7077", + f"{self.spark_mirror_path}/sbin/start-worker.sh", + f"spark://{self.spark_master_ip}:7077", *( "--properties-file", - f"{SPARK_MIRROR_PATH}/conf/spark-dg-config.conf", + f"{self.spark_mirror_path}/conf/spark-dg-config.conf", ), ], self.dry_run, @@ -90,9 +89,9 @@ def clean(self): if self._service: self._service.wait() if self._master: - must([f"{SPARK_MIRROR_PATH}/sbin/stop-master.sh"], self.dry_run) + must([f"{self.spark_mirror_path}/sbin/stop-master.sh"], self.dry_run) if self._worker: - must([f"{SPARK_MIRROR_PATH}/sbin/stop-worker.sh"], self.dry_run) + must([f"{self.spark_mirror_path}/sbin/stop-worker.sh"], self.dry_run) def __exit__(self, type, value, traceback): self.clean() @@ -101,6 +100,9 @@ def __exit__(self, type, value, traceback): @dataclass class Launcher: launcher_args: any + spark_mirror_path: Path + spark_master_ip: str + tpch_spark_path: Path output_dir: Path dry_run: bool @@ -113,8 +115,9 @@ def launch(self): [ *("python3", "-u", "-m", "rpc.launch_tpch_queries"), *self.launcher_args, - *("--spark-mirror-path", SPARK_MIRROR_PATH), - *("--tpch-spark-path", TPCH_SPARK_PATH), + *("--spark-master-ip", self.spark_master_ip), + *("--spark-mirror-path", self.spark_mirror_path), + *("--tpch-spark-path", self.tpch_spark_path), ], self.dry_run, stdout=f_out, @@ -127,19 +130,27 @@ class Experiment: name: str service_args: any launcher_args: any - args: any - def run(self): - output_dir = self.args.output_dir / self.name + def run(self, args): + output_dir = args.output_dir / self.name 
if not output_dir.exists(): output_dir.mkdir(parents=True) with Service( service_args=self.service_args, + spark_mirror_path=args.spark_mirror_path, + spark_master_ip=args.spark_master_ip, output_dir=output_dir, - dry_run=self.args.dry_run, + dry_run=args.dry_run, ) as s: - Launcher(self.launcher_args, output_dir, self.args.dry_run).launch() + Launcher( + launcher_args=self.launcher_args, + spark_mirror_path=args.spark_mirror_path, + spark_master_ip=args.spark_master_ip, + tpch_spark_path=args.tpch_spark_path, + output_dir=output_dir, + dry_run=args.dry_run, + ).launch() def main(): @@ -149,6 +160,24 @@ def main(): action="store_true", help="Prints commands that will be executed for each experiment", ) + parser.add_argument( + "--spark-mirror-path", + type=Path, + required=True, + help="Path to spark-mirror repository", + ) + parser.add_argument( + "--spark-master-ip", + type=str, + required=True, + help="IP address of node running Spark master", + ) + parser.add_argument( + "--tpch-spark-path", + type=Path, + required=True, + help="Path to TPC-H Spark repository", + ) parser.add_argument("--output-dir", type=Path, default=Path("exp-output")) args = parser.parse_args() @@ -183,19 +212,6 @@ def main(): *("--scheduler_plan_ahead_no_consideration_gap", 1), ] experiments = [ - Experiment( - name="edf-q300-hard", - service_args=[ - *base_args, - *edf_args, - *variance_args, - ], - launcher_args=[ - *("--num_queries", 300), - *("--variable_arrival_rate", 0.052), - ], - args=args, - ), Experiment( name="dsched-q300-hard", service_args=[ @@ -207,14 +223,13 @@ def main(): *("--num_queries", 300), *("--variable_arrival_rate", 0.052), ], - args=args, ), ] for i, experiment in enumerate(experiments): try: print(f"=== {experiment.name} ({i+1}/{len(experiments)}) ===") - experiment.run() + experiment.run(args) print("=== done ===") except Exception as e: print(traceback.format_exc()) From bd1631020846c5fbcf8c920b1bc8eecc5bc16415 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: 
Wed, 4 Dec 2024 01:21:16 -0500 Subject: [PATCH 104/128] reinstate previous eventQueue priority order (task_placement before scheduer) --- simulator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/simulator.py b/simulator.py index 882526e2..d80335fe 100644 --- a/simulator.py +++ b/simulator.py @@ -38,9 +38,9 @@ class EventType(Enum): TASK_PREEMPT = 7 # Ask the simulator to preempt a task. TASK_MIGRATION = 8 # Ask the simulator to migrate a task. LOAD_PROFILE = 9 # Ask the simulator to load a profile into the WorkerPool. - SCHEDULER_START = 10 # Requires the simulator to invoke the scheduler. - SCHEDULER_FINISHED = 11 # Signifies the end of the scheduler loop. - TASK_PLACEMENT = 12 # Ask the simulator to place a task. + TASK_PLACEMENT = 10 # Ask the simulator to place a task. + SCHEDULER_START = 11 # Requires the simulator to invoke the scheduler. + SCHEDULER_FINISHED = 12 # Signifies the end of the scheduler loop. SIMULATOR_END = 13 # Signify the end of the simulator loop. LOG_UTILIZATION = 14 # Ask the simulator to log worker pool utilization. LOG_STATS = 15 # Log simulator statistics From b4aceebeb6fd21fb87d1e67b8b3117396e20a1f7 Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Wed, 4 Dec 2024 01:24:34 -0500 Subject: [PATCH 105/128] [simulator] Unschedule subtree rooted at task if task is unable to run at timestep --- simulator.py | 108 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 76 insertions(+), 32 deletions(-) diff --git a/simulator.py b/simulator.py index d80335fe..f9b0dc55 100644 --- a/simulator.py +++ b/simulator.py @@ -1396,6 +1396,67 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: ), "Inconsistency in future placements." task_graph = workload.get_task_graph(task.task_graph) assert task_graph is not None, "Inconsistency in Task placement and Workload." 
+ + # Subroutine to handle avoid automatic re-placement of tasks in the next timestep + # if they were unable to start either due to (i) parent task not finished or + # (ii) worker not ready. The sub-tree rooted at the task is unscheduled and will + # be placed again in the next run of the scheduler. + def unschedule_subtree_rooted_at_task(task): + # Find all dependent tasks rooted from given task to unschedule + def subtree_tasks_to_unschedule(task): + tasks_to_unschedule = [task] + for child_task in task_graph.get_children(task): + tasks_to_unschedule.extend(subtree_tasks_to_unschedule(child_task)) + return tasks_to_unschedule + + tasks_to_unschedule = subtree_tasks_to_unschedule(task) + self._logger.info("[%s] Going to unschedule tasks rooted from %s. " + "List of tasks that will be unscheduled are: %s", + event.time.time, + task, + tasks_to_unschedule) + for unschedule_task in tasks_to_unschedule: + if unschedule_task.id in self._future_placement_events: + future_placement_event = self._future_placement_events[ + unschedule_task.id + ] + if future_placement_event.time > event.time: + # Delete future event from event_queue and from future_placement_events + self._event_queue.remove_event(future_placement_event) + del self._future_placement_events[unschedule_task.id] + msg = ( + f"[{event.time.time}] Retrieved future placement event {future_placement_event} " + f"for task {unschedule_task} and removed it." + ) + self._logger.info(msg) + elif future_placement_event.time == event.time: + # Cannot delete from event_queue, as this event is likely being processed + del self._future_placement_events[unschedule_task.id] + msg = ( + f"[{event.time.time}] Removed future placement event {future_placement_event} " + f"for task {unschedule_task} at the same time." + ) + self._logger.info(msg) + else: + msg = ( + f"[{event.time.time}] Future placement event {future_placement_event} for task " + f"{unschedule_task} is in the past." 
+ ) + self._logger.warning(msg) + + # Unschedule the task + unschedule_task.unschedule(event.time) + self._csv_logger.debug( + f"{event.time.time},TASK_UNSCHEDULED,{unschedule_task.name},{unschedule_task.timestamp}," + f"{unschedule_task.id},{unschedule_task.task_graph}" + ) + + self._logger.info( + "[%s] Finished unscheduling of task %s.", + event.time.time, + unschedule_task, + ) + if not task.is_ready_to_run(task_graph): if task.state == TaskState.CANCELLED or task_graph.is_cancelled(): # The Task was cancelled. Consume the event. @@ -1420,34 +1481,20 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: return else: # If the Task is not ready to run and wasn't cancelled, - # find the next possible time to try executing the task. - parent_completion_time = max( - parent.remaining_time for parent in task_graph.get_parents(task) - ) - next_placement_time = event.time + max( - parent_completion_time, - self._min_placement_push_duration, - ) - next_placement_event = Event( - event_type=event.event_type, - time=next_placement_time, - task=event.task, - placement=event.placement, - ) - event.placement._placement_time = next_placement_time - self._future_placement_events[task.id] = next_placement_event - self._event_queue.add_event(next_placement_event) + # unschedule the task and its subtree. self._logger.info( - "[%s] The Task %s was not ready to run, and has been pushed for " - "later placement at %s.", + "[%s] The Task %s was not ready to run. The task along with its " + "sub-tree will be unscheduled.", event.time.to(EventTime.Unit.US).time, task, - next_placement_time, ) self._csv_logger.debug( f"{event.time.time},TASK_NOT_READY,{task.name},{task.timestamp}," f"{task.id},{event.placement.worker_pool_id}" ) + + # Unschedule the task and its subtree rooted at this task. + unschedule_subtree_rooted_at_task(task) return # Initialize the task at the given placement time, and place it on # the WorkerPool. 
@@ -1484,26 +1531,23 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: task.id ] = event.placement else: - next_placement_time = event.time + self._min_placement_push_duration - next_placement_event = Event( - event_type=event.event_type, - time=next_placement_time, - task=event.task, - placement=event.placement, - ) - self._event_queue.add_event(next_placement_event) - self._future_placement_events[task.id] = next_placement_event + # If the placement was not successful, send the sub-tree of the taskgraph + # rooted at this task back to its previous state. It allows the scheduler + # to re-schedule in its next run. self._logger.warning( - "[%s] Task %s cannot be placed on worker %s, pushing placement to %s.", + "[%s] Task %s couldn't be placed on worker %s. The task along with its " + "sub-tree will be unscheduled.", event.time.time, task, - worker_pool, - next_placement_time, + event.placement.worker_pool_id, ) self._csv_logger.debug( f"{event.time.time},WORKER_NOT_READY,{task.name},{task.timestamp}," f"{task.id},{event.placement.worker_pool_id}" ) + + # Unschedule the task and its subtree rooted at this task. + unschedule_subtree_rooted_at_task(task) def __handle_task_migration(self, event: Event) -> None: """Handles the TASK_MIGRATION event. 
This event must be followed by a From ce582e072231c190a2bcaf4492cb13c756d2c92d Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Wed, 4 Dec 2024 01:26:01 -0500 Subject: [PATCH 106/128] [service] log line to track tasks that get delayed in execution --- rpc/service.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rpc/service.py b/rpc/service.py index 11040e41..55537d8c 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -753,6 +753,9 @@ async def NotifyTaskCompletion(self, request, context): self._logger.info( f"[{stime}] Adding event {task_finished_event} to the simulator's event queue" ) + if actual_task_completion_time < self.__stime(): + self._logger.error( + f"[{stime}] Task '{request.task_id}' of task graph '{r.task_graph.name}' had exceeded its runtime by {self.__stime() - actual_task_completion_time}") scheduler_start_event = Event( event_type=EventType.SCHEDULER_START, From 3831f4406c04d88d9169f34452da9dbd41eee946 Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 4 Dec 2024 11:05:41 -0500 Subject: [PATCH 107/128] sleep for some time before signalling shutdown --- rpc/launch_tpch_queries.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rpc/launch_tpch_queries.py b/rpc/launch_tpch_queries.py index 04bff0ad..82ba4881 100644 --- a/rpc/launch_tpch_queries.py +++ b/rpc/launch_tpch_queries.py @@ -241,6 +241,9 @@ def main(): for p in ps: p.wait() + # Wait for some time before sending the shutdown signal + time.sleep(20) + channel = grpc.insecure_channel("localhost:50051") stub = erdos_scheduler_pb2_grpc.SchedulerServiceStub(channel) response = stub.Shutdown(erdos_scheduler_pb2.Empty()) From 2a748383a318670f12f453a59a2f7d907e97b30f Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 4 Dec 2024 11:06:23 -0500 Subject: [PATCH 108/128] hack analyze pipeline to work with tpch output --- analyze.py | 14 ++++++-------- data/csv_reader.py | 5 +++++ data/csv_types.py | 26 +++++++++++++++++++------- simulator.py | 43 
+++++++++++++++++++++---------------------- 4 files changed, 51 insertions(+), 37 deletions(-) diff --git a/analyze.py b/analyze.py index a2219981..97933fbd 100644 --- a/analyze.py +++ b/analyze.py @@ -335,7 +335,7 @@ def analyze_resource_utilization( # Plotting defaults. # hatches = ['//', '--', '**'] # alphas = np.arange(0.2, 1.2, 0.2) - resource_color = {"GPU": "red", "CPU": "green"} + resource_color = {"Slot": "green"} # Worker Pool statistics worker_pool_stats = csv_reader.get_worker_pool_utilizations(scheduler_csv_file) @@ -1246,16 +1246,16 @@ def log_aggregate_stats( / sum(stat.resource_utilizations[resource]) for stat in worker_pool_stats ] - for resource in ("GPU", "CPU") + for resource in ("Slot",) } scheduler_invocations = csv_reader.get_scheduler_invocations(csv_file) placed_tasks = [ - scheduler_invocation.placed_tasks + scheduler_invocation.num_placed_tasks for scheduler_invocation in scheduler_invocations ] unplaced_tasks = [ - scheduler_invocation.unplaced_tasks + scheduler_invocation.num_unplaced_tasks for scheduler_invocation in scheduler_invocations ] @@ -1268,8 +1268,7 @@ def log_aggregate_stats( placement_delay, deadline_delay, stat_function(e2e_response_time), - stat_function(resource_uses["GPU"]), - stat_function(resource_uses["CPU"]), + stat_function(resource_uses["Slot"]), stat_function(placed_tasks), stat_function(unplaced_tasks), log_name, @@ -1288,8 +1287,7 @@ def log_aggregate_stats( "Placement", "Deadline", "JCT", - "GPU", - "CPU", + "Slot", "Placed", "Unplaced", "Log", diff --git a/data/csv_reader.py b/data/csv_reader.py index d4d0d1f4..b81e0767 100644 --- a/data/csv_reader.py +++ b/data/csv_reader.py @@ -63,6 +63,11 @@ def parse_events(self, readings: Mapping[str, Sequence[str]]): ) elif reading[1] == "UPDATE_WORKLOAD": simulator.total_tasks += int(reading[2]) + elif reading[1] == "LOG_STATS": + assert ( + simulator is not None + ), "No SIMULATOR_START found for a corresponding SIMULATOR_END." 
+ simulator.update_stats(reading) elif reading[1] == "SIMULATOR_END": assert ( simulator is not None diff --git a/data/csv_types.py b/data/csv_types.py index 390cd0a6..851299f8 100644 --- a/data/csv_types.py +++ b/data/csv_types.py @@ -385,6 +385,18 @@ def __init__(self, csv_path: str, start_time: int, total_tasks: int = 0): self.scheduler_invocations: list[Scheduler] = [] self.task_graphs: dict[str, TaskGraph] = {} + def update_stats(self, csv_reading: str): + assert ( + csv_reading[1] == "LOG_STATS" + ), f"The event {csv_reading[1]} was not of type LOG_STATS." + self.finished_tasks = int(csv_reading[2]) + self.dropped_tasks = int(csv_reading[3]) + self.missed_deadlines = int(csv_reading[4]) + self.finished_task_graphs = int(csv_reading[5]) + self.dropped_taskgraphs = int(csv_reading[6]) + self.missed_taskgraphs = int(csv_reading[7]) + self.goodput_taskgraphs = self.finished_task_graphs - self.missed_taskgraphs + def update_finish(self, csv_reading: str): """Updates the values of the Simulator based on the SIMULATOR_END event from CSV. @@ -396,10 +408,10 @@ def update_finish(self, csv_reading: str): csv_reading[1] == "SIMULATOR_END" ), f"The event {csv_reading[1]} was not of type SIMULATOR_END." 
self.end_time = int(csv_reading[0]) - self.finished_tasks = int(csv_reading[2]) - self.dropped_tasks = int(csv_reading[3]) - self.missed_deadlines = int(csv_reading[4]) - self.finished_task_graphs = int(csv_reading[5]) - self.dropped_taskgraphs = int(csv_reading[6]) - self.missed_taskgraphs = int(csv_reading[7]) - self.goodput_taskgraphs = self.finished_task_graphs - self.missed_taskgraphs + # self.finished_tasks = int(csv_reading[2]) + # self.dropped_tasks = int(csv_reading[3]) + # self.missed_deadlines = int(csv_reading[4]) + # self.finished_task_graphs = int(csv_reading[5]) + # self.dropped_taskgraphs = int(csv_reading[6]) + # self.missed_taskgraphs = int(csv_reading[7]) + # self.goodput_taskgraphs = self.finished_task_graphs - self.missed_taskgraphs diff --git a/simulator.py b/simulator.py index f9b0dc55..6f3bf572 100644 --- a/simulator.py +++ b/simulator.py @@ -1694,35 +1694,34 @@ def is_source_task(task): # Add task graph entry in self._current_task_graph_placements to # track its task placements + # + # In addition to newly added task graphs, self._workload also + # contains all previously released task graphs. + # + # So, we guard the addition of the entry on two conditions: + # (1) The task graph doesn't have an entry (we don't want to + # nuke an existing one) + # (2) The task graph is not complete (we only keep the entry + # alive while the task graph is running to avoid a memory + # leak) for task_graph_name, task_graph in self._workload.task_graphs.items(): - # In addition to newly added task graphs, self._workload also - # contains all previously released task graphs. 
- # - # So, we guard the addition of the entry on two conditions: - # (1) The task graph doesn't have an entry (we don't want to - # nuke an existing one) - # (2) The task graph is not complete (we only keep the entry - # alive while the task graph is running to avoid a memory - # leak) if ( task_graph_name not in self._current_task_graph_placements and not task_graph.is_complete() ): self._current_task_graph_placements[task_graph_name] = {} - # # Add the TaskGraphRelease events into the system. - # for task_graph_name, task_graph in self._workload.task_graphs.items(): - # event = Event( - # event_type=EventType.TASK_GRAPH_RELEASE, - # time=task_graph.release_time, - # task_graph=task_graph_name, - # ) - # self._event_queue.add_event(event) - # self._logger.info( - # "[%s] Added %s to the event queue.", - # self._simulator_time.to(EventTime.Unit.US).time, - # event, - # ) + event = Event( + event_type=EventType.TASK_GRAPH_RELEASE, + time=task_graph.release_time, + task_graph=task_graph_name, + ) + self._event_queue.add_event(event) + self._logger.info( + "[%s] Added %s to the event queue.", + self._simulator_time.to(EventTime.Unit.US).time, + event, + ) max_release_time = self._simulator_time for task in releasable_tasks: From f4bbe6afb88215a94641ee7cb6efcda4fb8b3f3d Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Wed, 4 Dec 2024 12:29:50 -0500 Subject: [PATCH 109/128] [simulator] check task state before invoking unschedule on it --- simulator.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/simulator.py b/simulator.py index 6f3bf572..e55f1665 100644 --- a/simulator.py +++ b/simulator.py @@ -1445,17 +1445,22 @@ def subtree_tasks_to_unschedule(task): self._logger.warning(msg) # Unschedule the task - unschedule_task.unschedule(event.time) - self._csv_logger.debug( - f"{event.time.time},TASK_UNSCHEDULED,{unschedule_task.name},{unschedule_task.timestamp}," - f"{unschedule_task.id},{unschedule_task.task_graph}" - ) - - 
self._logger.info( - "[%s] Finished unscheduling of task %s.", - event.time.time, - unschedule_task, - ) + if unschedule_task.state == TaskState.SCHEDULED: + unschedule_task.unschedule(event.time) + self._csv_logger.debug( + f"{event.time.time},TASK_UNSCHEDULED,{unschedule_task.name},{unschedule_task.timestamp}," + f"{unschedule_task.id},{unschedule_task.task_graph}" + ) + msg = ( + f"[{event.time.time}] Finished unscheduling of task {unschedule_task}." + ) + self._logger.info(msg) + else: + msg = ( + f"[{event.time.time}] Task {unschedule_task} was not in SCHEDULED state and was in " + f"{unschedule_task.state} state. Skip unscheduling." + ) + self._logger.info(msg) if not task.is_ready_to_run(task_graph): if task.state == TaskState.CANCELLED or task_graph.is_cancelled(): From b958a8a7445847006d89570bccc71b7e3162812f Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Wed, 4 Dec 2024 18:18:43 -0500 Subject: [PATCH 110/128] add support for tpch query partitioning --- data/tpch_loader.py | 103 ++++++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 28 deletions(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index 97c2522a..8d04198a 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -6,6 +6,7 @@ from typing import Any, Dict, List, Optional, Callable, Tuple from pathlib import Path +from enum import Enum import absl import numpy as np @@ -28,6 +29,12 @@ from .base_workload_loader import BaseWorkloadLoader +class TpchQueryDifficulty(Enum): + easy = {1, 3, 4, 6, 12, 14, 17, 19, 22} + medium = {10, 11, 13, 15, 16, 18, 20} + hard = {2, 7, 8, 9, 21} + + class TpchLoader: """Construct TPC-H task graph from a query profile @@ -286,64 +293,104 @@ def __init__(self, flags: "absl.flags") -> None: # Instantiate tpch loader self._tpch_loader = TpchLoader(path=flags.tpch_query_dag_spec, flags=flags) - # Gather release times - release_policy = self.__make_release_policy() - release_times = release_policy.get_release_times( - 
completion_time=EventTime(self._flags.loop_timeout, EventTime.Unit.US) - ) + # Intialize [(query_num, release_time)] + self._query_nums_and_release_times = [] + if len(flags.override_num_invocations) > 0: + # One each for easy, medium, and hard + assert len(flags.override_num_invocations) == len(TpchQueryDifficulty) + assert len(flags.override_poisson_arrival_rates) == len( + flags.override_num_invocations + ) - # Sample queries to be released - query_nums = [ - self._rng.randint(1, self._tpch_loader.num_queries) - for _ in range(self._flags.override_num_invocation) - ] + # only works with poisson distribution + assert flags.override_release_policy == "poisson" + + for i, part in enumerate(TpchQueryDifficulty): + print(flags.override_poisson_arrival_rates[i]) + release_policy = self.__make_release_policy( + policy_type=flags.override_release_policy, + arrival_rate=float(flags.override_poisson_arrival_rates[i]), + num_invocations=int(flags.override_num_invocations[i]), + ) + release_times = release_policy.get_release_times( + completion_time=EventTime( + self._flags.loop_timeout, EventTime.Unit.US + ) + ) + query_nums = [ + self._rng.choice(list(part.value)) + for _ in range(int(flags.override_num_invocations[i])) + ] + self._query_nums_and_release_times.extend( + list(zip(query_nums, release_times)) + ) + + self._query_nums_and_release_times.sort(key=lambda x: x[1]) + else: + release_policy = self.__make_release_policy() + release_times = release_policy.get_release_times( + completion_time=EventTime(self._flags.loop_timeout, EventTime.Unit.US) + ) + query_nums = [ + self._rng.randint(1, self._tpch_loader.num_queries) + for _ in range(self._flags.override_num_invocation) + ] + self._query_nums_and_release_times.extend( + list(zip(query_nums, release_times)) + ) - self._query_nums_and_release_times = list(zip(query_nums, release_times)) self._current_release_pointer = 0 # Initialize workload self._workload = Workload.empty(flags) - def __make_release_policy(self): + 
def __make_release_policy( + self, policy_type=None, arrival_rate=None, num_invocations=None + ): + if policy_type is None: + policy_type = self._flags.override_release_policy + if arrival_rate is None: + arrival_rate = self._flags.override_poisson_arrival_rate + if num_invocations is None: + num_invocations = self._flags.override_num_invocation + release_policy_args = {} - if self._flags.override_release_policy == "periodic": + if policy_type == "periodic": release_policy_args = { "period": EventTime( self._flags.override_arrival_period, EventTime.Unit.US ), } - elif self._flags.override_release_policy == "fixed": + elif policy_type == "fixed": release_policy_args = { "period": EventTime( self._flags.override_arrival_period, EventTime.Unit.US ), - "num_invocations": self._flags.override_num_invocation, + "num_invocations": num_invocations, } - elif self._flags.override_release_policy == "poisson": + elif policy_type == "poisson": release_policy_args = { - "rate": self._flags.override_poisson_arrival_rate, - "num_invocations": self._flags.override_num_invocation, + "rate": arrival_rate, + "num_invocations": num_invocations, } - elif self._flags.override_release_policy == "gamma": + elif policy_type == "gamma": release_policy_args = { - "rate": self._flags.override_poisson_arrival_rate, - "num_invocations": self._flags.override_num_invocation, + "rate": arrival_rate, + "num_invocations": num_invocations, "coefficient": self._flags.override_gamma_coefficient, } - elif self._flags.override_release_policy == "fixed_gamma": + elif policy_type == "fixed_gamma": release_policy_args = { - "variable_arrival_rate": self._flags.override_poisson_arrival_rate, + "variable_arrival_rate": arrival_rate, "base_arrival_rate": self._flags.override_base_arrival_rate, - "num_invocations": self._flags.override_num_invocation, + "num_invocations": num_invocations, "coefficient": self._flags.override_gamma_coefficient, } else: - raise NotImplementedError( - f"Release policy 
{self._flags.override_release_policy} not implemented." - ) + raise NotImplementedError(f"Release policy {policy_type} not implemented.") return make_release_policy( - self._flags.override_release_policy, + policy_type, release_policy_args, self._rng, self._rng_seed, From 038cc7f778286d2758929fef496c18d3d93ceb2f Mon Sep 17 00:00:00 2001 From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Sat, 14 Dec 2024 15:13:23 -0500 Subject: [PATCH 111/128] run_service_experiments: Log service stdout/stderr --- scripts/run_service_experiments.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py index 55e074d7..f5dbbee8 100644 --- a/scripts/run_service_experiments.py +++ b/scripts/run_service_experiments.py @@ -40,15 +40,21 @@ def __enter__(self): csv_file = self.output_dir / "service.csv" # launch service - self._service = bang( - [ - *("python3", "-m", "rpc.service"), - *("--log", log_file), - *("--csv_file_name", csv_file), - *self.service_args, - ], - self.dry_run, - ) + with ( + open(self.output_dir / "service.stdout", "w") as f_out, + open(self.output_dir / "service.stderr", "w") as f_err, + ): + self._service = bang( + [ + *("python3", "-m", "rpc.service"), + *("--log_file_name", log_file), + *("--csv_file_name", csv_file), + *self.service_args, + ], + self.dry_run, + stdout=f_out, + stderr=f_err + ) # sleep for some time if not self.dry_run: From 6f84681e0d1af283bfbc5ab9559310bf634adbf0 Mon Sep 17 00:00:00 2001 From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Sat, 14 Dec 2024 15:14:24 -0500 Subject: [PATCH 112/128] run_service_experiments.py: Fix --dry_run --- scripts/run_service_experiments.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py index f5dbbee8..15ad51ab 100644 --- a/scripts/run_service_experiments.py +++ 
b/scripts/run_service_experiments.py @@ -17,10 +17,11 @@ def bang(cmd, dry_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE): def must(cmd, dry_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE): p = bang(cmd, dry_run, stdout, stderr) - if p.wait() != 0: - stdout, stderr = p.communicate() - raise Exception(f"Command failed. stdout: {stdout}. stderr: {stderr}.") - return p + if not dry_run: + if p.wait() != 0: + stdout, stderr = p.communicate() + raise Exception(f"Command failed. stdout: {stdout}. stderr: {stderr}.") + return p @dataclass From 968b158abb53f876b1199d4bb8c3e460c85b511d Mon Sep 17 00:00:00 2001 From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Sat, 14 Dec 2024 15:25:35 -0500 Subject: [PATCH 113/128] run_service_experiments: Timestamp results folder --- scripts/run_service_experiments.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py index 15ad51ab..67f86241 100644 --- a/scripts/run_service_experiments.py +++ b/scripts/run_service_experiments.py @@ -4,6 +4,7 @@ import traceback from pathlib import Path from dataclasses import dataclass +from datetime import datetime def bang(cmd, dry_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE): @@ -139,7 +140,7 @@ class Experiment: launcher_args: any def run(self, args): - output_dir = args.output_dir / self.name + output_dir = args.output_dir / (self.name + '-' + datetime.now().isoformat()) if not output_dir.exists(): output_dir.mkdir(parents=True) From 51d6f467fa10c71c78e82c8709f43949122c00c1 Mon Sep 17 00:00:00 2001 From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Sat, 14 Dec 2024 15:27:05 -0500 Subject: [PATCH 114/128] run_service_experiments: Fix hang on exception Fixes an issue where if start-master.sh or start-worker.sh exits with a nonzero code, or more generally if an exception happens in Service.__enter__(), run_service_experiments.py hangs and doesn't report 
the exception. --- scripts/run_service_experiments.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py index 67f86241..17ef3ff3 100644 --- a/scripts/run_service_experiments.py +++ b/scripts/run_service_experiments.py @@ -93,9 +93,14 @@ def __enter__(self): if not self.dry_run: time.sleep(5) + return self + + def wait(self): + self._service.wait() + def clean(self): if self._service: - self._service.wait() + self._service.terminate() if self._master: must([f"{self.spark_mirror_path}/sbin/stop-master.sh"], self.dry_run) if self._worker: @@ -159,6 +164,7 @@ def run(self, args): output_dir=output_dir, dry_run=args.dry_run, ).launch() + s.wait() def main(): From 3491d59af649bd3f1244cf69ac2b1d41a00afead Mon Sep 17 00:00:00 2001 From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Sat, 14 Dec 2024 18:23:48 -0500 Subject: [PATCH 115/128] Spark service: Correctly log stats on shutdown When the last application is deregistered from the spark service, execute all remaining events from the simulator. This allows the final LOG_STATS event to be processed so we can calculate the SLO attainment. Unlike normal runs of the simulator, a SIMULATOR_END event is not inserted as some tasks might not have finished in the simulator and it's unclear when they will finish. The simulator is patched to allow an empty event queue in Simulator.simulate(). 
--- rpc/service.py | 17 +++++++++++++---- simulator.py | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 55537d8c..79425db5 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -232,7 +232,8 @@ def __init__(self, server) -> None: self._registered_app_drivers = ( {} ) # Spark driver id differs from taskgraph name (application id) - self._shutdown = False + self._received_shutdown = False + self._shutting_down = False self._lock = threading.Lock() super().__init__() @@ -361,7 +362,15 @@ async def DeregisterDriver(self, request, context): msg = f"[{stime}] Successfully de-registered driver with id {request.id} for task graph {task_graph_name}" self._logger.info(msg) - if len(self._registered_app_drivers) == 0 and self._shutdown: + if len(self._registered_app_drivers) == 0 and self._received_shutdown: + self._logger.info(f"[{stime}] The last driver has been deregistered; finishing simulation") + # Signals _tick_simulator() to stop. Shouldn't be + # necessary in principle because after the with block + # ends, there shouldn't be any more events left to run, + # but doesn't hurt. 
+ self._shutting_down = True + with self._lock: + self._simulator.simulate() await self._server.stop(0) return erdos_scheduler_pb2.DeregisterDriverResponse( @@ -777,11 +786,11 @@ async def NotifyTaskCompletion(self, request, context): ) async def Shutdown(self, request, context): - self._shutdown = True + self._received_shutdown = True return erdos_scheduler_pb2.Empty() async def _tick_simulator(self): - while True: + while not self._shutting_down: with self._lock: if self._simulator is not None: stime = self.__stime() diff --git a/simulator.py b/simulator.py index e55f1665..6de1cee2 100644 --- a/simulator.py +++ b/simulator.py @@ -517,7 +517,7 @@ def f(): step_size = time_until_next_event else: step_size = time_until_next_event - return step_size + return None if time_until_next_event.is_invalid() else step_size self.__simulate_f(should_step=f) From 727570260c03337f9c5f43ea0276fcf112cd585a Mon Sep 17 00:00:00 2001 From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Sun, 15 Dec 2024 13:36:09 -0500 Subject: [PATCH 116/128] Set correct completion time for finishing Tasks On a TASK_FINISH event, set the task completion time to the time of the event rather than the last time the task was stepped. Resolves a bug in the service where tasks that finish later than the simulator's profiled runtime predicts get assigned the wrong completion time. --- simulator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simulator.py b/simulator.py index 6de1cee2..a752c58f 100644 --- a/simulator.py +++ b/simulator.py @@ -1232,7 +1232,7 @@ def __handle_task_finished(self, event: Event) -> None: # Remove the task from it's task graph's current placements del self._current_task_graph_placements[event.task.task_graph][event.task.id] - event.task.finish() + event.task.finish(event.time) # Log the TASK_FINISHED event into the CSV. 
self._finished_tasks += 1 From 7e19fd1b624fa7517e300bc33af52eefdb064d1c Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Sat, 11 Jan 2025 09:57:35 -0500 Subject: [PATCH 117/128] remove extraneous print --- data/tpch_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/data/tpch_loader.py b/data/tpch_loader.py index 8d04198a..57ea6816 100644 --- a/data/tpch_loader.py +++ b/data/tpch_loader.py @@ -306,7 +306,6 @@ def __init__(self, flags: "absl.flags") -> None: assert flags.override_release_policy == "poisson" for i, part in enumerate(TpchQueryDifficulty): - print(flags.override_poisson_arrival_rates[i]) release_policy = self.__make_release_policy( policy_type=flags.override_release_policy, arrival_rate=float(flags.override_poisson_arrival_rates[i]), From a22e40655c9c4b45c6601f96371bc5af41a1256c Mon Sep 17 00:00:00 2001 From: Elton Leander Pinto Date: Sun, 19 Jan 2025 15:17:37 -0500 Subject: [PATCH 118/128] fix non-determinism in deadlines --- rpc/service.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 79425db5..7f76df83 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -448,11 +448,12 @@ async def RegisterTaskGraph(self, request, context): success=False, message=msg, num_executors=0 ) + task_graph = job_graph.get_next_task_graph( + start_time=release_time, + _flags=FLAGS, + ) + def gen(release_time): - task_graph = job_graph.get_next_task_graph( - start_time=release_time, - _flags=FLAGS, - ) return task_graph, stage_id_mapping else: @@ -485,6 +486,10 @@ async def RegisterEnvironmentReady(self, request, context): r = self._registered_applications[request.id] r.generate_task_graph(stime) + for task in r.task_graph: + d = stime - task._release_time + task._release_time = stime + task.update_deadline(task.deadline+d) with self._lock: self._simulator._workload.add_task_graph(r.task_graph) From 1a43c1891b71e944e664cee0728d156ca538d1df Mon Sep 17 00:00:00 2001 From: Rohan Bafna 
<130247393+rohanbafna@users.noreply.github.com> Date: Sun, 19 Jan 2025 15:35:04 -0500 Subject: [PATCH 119/128] Revert "fix non-determinism in deadlines" This reverts commit a22e40655c9c4b45c6601f96371bc5af41a1256c. --- rpc/service.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/rpc/service.py b/rpc/service.py index 7f76df83..79425db5 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -448,12 +448,11 @@ async def RegisterTaskGraph(self, request, context): success=False, message=msg, num_executors=0 ) - task_graph = job_graph.get_next_task_graph( - start_time=release_time, - _flags=FLAGS, - ) - def gen(release_time): + task_graph = job_graph.get_next_task_graph( + start_time=release_time, + _flags=FLAGS, + ) return task_graph, stage_id_mapping else: @@ -486,10 +485,6 @@ async def RegisterEnvironmentReady(self, request, context): r = self._registered_applications[request.id] r.generate_task_graph(stime) - for task in r.task_graph: - d = stime - task._release_time - task._release_time = stime - task.update_deadline(task.deadline+d) with self._lock: self._simulator._workload.add_task_graph(r.task_graph) From 4aa31ed88ebff5b8bc5518476e6250daec2f1bf5 Mon Sep 17 00:00:00 2001 From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Sun, 19 Jan 2025 16:39:51 -0500 Subject: [PATCH 120/128] Ensure consistent deadlines between simulator and Spark (hack) We found that deadlines for task graphs weren't consistent between the simulator and Spark even with the same RNG seed being used, due to the fact that EventTime keeps a global RNG it uses for all of its fuzzing and both deadlines and runtime variances are fuzzed. Since in simulator runs, task deadlines are all calculated at the start and runtime variances are calculated later, and in Spark, task deadlines and runtime variances are calculated throughout the experiment lifecycle, different deadline variances are obtained between simulator and Spark runs on the same experiment. 
Our solution is to pass a unique RNG used just for calculating deadline variances to the fuzzer. This RNG is hardcoded with a seed of 42; this is fine for experiments but it should probably be changed to the random_seed command line flag. --- utils.py | 8 ++++++-- workload/jobs.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/utils.py b/utils.py index 7e8f2814..1fa8adb0 100644 --- a/utils.py +++ b/utils.py @@ -93,24 +93,28 @@ def to_unchecked(self, unit: Unit) -> Tuple[float, Unit]: return self.time * self.unit.to(unit), unit def fuzz( - self, variance: Tuple[int, int], bounds: Tuple[int, int] = (0, sys.maxsize) + self, variance: Tuple[int, int], bounds: Tuple[int, int] = (0, sys.maxsize), rng: random.Random = None ) -> "EventTime": """Fuzz the time according to the provided `variance` and within the bounds. Args: variance (`Tuple[int, int]`): The (minimum, maximum) % variance to fuzz by. bounds (`Tuple[int, int]`): The (minimum, maximum) bounds to fuzz within. + rng (random.Random): The random number generator to use. Defaults to an internal RNG if none is specified. Returns: The fuzzed time according to the given variance. """ + if rng is None: + rng = type(self)._rng + min_variance, max_variance = variance min_bound, max_bound = bounds fuzzed_time = max( min_bound, min( max_bound, - type(self)._rng.uniform( + rng.uniform( self.time * abs(min_variance) / 100.0, self.time * abs(max_variance) / 100.0, ), diff --git a/workload/jobs.py b/workload/jobs.py index 8d7a90d3..0195f1d1 100644 --- a/workload/jobs.py +++ b/workload/jobs.py @@ -200,6 +200,12 @@ class JobGraph(Graph[Job]): "fixed_gamma", ) + # !HACK! This random number generator is used exclusively to + # calculate deadlines. This ensures that deadlines are consistent + # across runs of the simulator in different contexts where RNGs + # are called in different orders. 
+ _deadline_rng = random.Random(42) + class ReleasePolicyType(Enum): """Represents the different release policies supported by a JobGraph.""" @@ -813,7 +819,7 @@ def _generate_task_graph( # NOTE: The taskgraph deadline is re-generated (and overwritten) after # use_branch_predicated_deadlines code, since fuzz is invoked again there. task_deadline = release_time + self.completion_time.fuzz( - deadline_variance, deadline_bounds + deadline_variance, deadline_bounds, log=task_logger, rng=type(self).deadline_rng ) # Generate all the `Task`s from the `Job`s in the graph. @@ -889,7 +895,7 @@ def _generate_task_graph( # NOTE: This is the second time the deadline is being set, based on a second # invocation of fuzz. task_graph_deadline = release_time + weighted_task_graph_length.fuzz( - deadline_variance, deadline_bounds + deadline_variance, deadline_bounds, log=task_logger, rng=type(self).deadline_rng ) if _flags and _flags.decompose_deadlines: stages_info = {} From 337de56fe6547019cd532513227d035dfdd8718b Mon Sep 17 00:00:00 2001 From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Sun, 19 Jan 2025 16:47:38 -0500 Subject: [PATCH 121/128] run_service_experiments: print args to service and launcher --- scripts/run_service_experiments.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py index 17ef3ff3..5c540b68 100644 --- a/scripts/run_service_experiments.py +++ b/scripts/run_service_experiments.py @@ -148,6 +148,10 @@ def run(self, args): output_dir = args.output_dir / (self.name + '-' + datetime.now().isoformat()) if not output_dir.exists(): output_dir.mkdir(parents=True) + with open(output_dir / "service.args", "w") as f: + print(*self.service_args, sep='\n', file=f) + with open(output_dir / "launcher.args", "w") as f: + print(*self.launcher_args, sep='\n', file=f) with Service( service_args=self.service_args, From ed9f03b565562f1976e8afc716d1d40b79fb30e8 Mon Sep 17 00:00:00 2001 
From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Sun, 19 Jan 2025 17:13:19 -0500 Subject: [PATCH 122/128] Fix some minor errors with deadline fuzzer --- workload/jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workload/jobs.py b/workload/jobs.py index 0195f1d1..0e77be69 100644 --- a/workload/jobs.py +++ b/workload/jobs.py @@ -819,7 +819,7 @@ def _generate_task_graph( # NOTE: The taskgraph deadline is re-generated (and overwritten) after # use_branch_predicated_deadlines code, since fuzz is invoked again there. task_deadline = release_time + self.completion_time.fuzz( - deadline_variance, deadline_bounds, log=task_logger, rng=type(self).deadline_rng + deadline_variance, deadline_bounds, rng=type(self)._deadline_rng ) # Generate all the `Task`s from the `Job`s in the graph. @@ -895,7 +895,7 @@ def _generate_task_graph( # NOTE: This is the second time the deadline is being set, based on a second # invocation of fuzz. task_graph_deadline = release_time + weighted_task_graph_length.fuzz( - deadline_variance, deadline_bounds, log=task_logger, rng=type(self).deadline_rng + deadline_variance, deadline_bounds, rng=type(self)._deadline_rng ) if _flags and _flags.decompose_deadlines: stages_info = {} From 76632ffb0d33b6544b21ce64bca907e0ccdf8f72 Mon Sep 17 00:00:00 2001 From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Tue, 21 Jan 2025 12:48:48 -0500 Subject: [PATCH 123/128] Spark service: Make job graph names the same as in TpchLoader Originally, the service named job graphs in the form Q[], where Spark sets the app id to app--, while TPC-H data loader named job graphs in the form Q[]. This commit changes how the service names job graphs by passing the index as an argument to the TpchQuery Spark application, which will then be forwarded into the Servicer through RegisterTaskGraph as a part of the query name. RegisterTaskGraph then uses the index to name the job graph. 
This ensures that the job graph names are always the same between a Spark run and a simulator run, irrespective of when the task graphs are actually released during a Spark run (which can be nondeterministic). The intent is to use these names to generate deadlines for the task graphs, so that deadlines are always consistent between Spark and simulator runs. This change requires a corresponding change to tpch-spark to forward the index to the Servicer. --- rpc/launch_tpch_queries.py | 5 +++-- rpc/service.py | 37 ++++++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/rpc/launch_tpch_queries.py b/rpc/launch_tpch_queries.py index 82ba4881..2d0f91bf 100644 --- a/rpc/launch_tpch_queries.py +++ b/rpc/launch_tpch_queries.py @@ -23,7 +23,7 @@ def map_dataset_to_deadline(dataset_size): return mapping.get(dataset_size, 120) # Default to 120s if dataset size is NA -def launch_query(query_number, args): +def launch_query(query_number, index, args): deadline = map_dataset_to_deadline(args.dataset_size) cmd = [ @@ -43,6 +43,7 @@ def launch_query(query_number, args): *("--class", "'main.scala.TpchQuery'"), f"{args.tpch_spark_path.resolve()}/target/scala-2.13/spark-tpc-h-queries_2.13-1.0.jar", f"{query_number}", + f"{index}", f"{args.dataset_size}", f"{args.max_cores}", ] @@ -229,7 +230,7 @@ def main(): query_number = args.queries[i] else: query_number = rng.randint(1, 22) - ps.append(launch_query(query_number, args)) + ps.append(launch_query(query_number, i, args)) print( f"({i+1}/{len(release_times)})", "Current time: ", diff --git a/rpc/service.py b/rpc/service.py index 79425db5..f4a7d504 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -399,18 +399,28 @@ async def RegisterTaskGraph(self, request, context): if request.name.startswith("TPCH Query"): # Parse request name query_parts = request.name.split() - if len(query_parts) != 3 and len(query_parts) != 5: - msg = f"[{stime}] Invalid TPCH query request" - return 
erdos_scheduler_pb2.RegisterTaskGraphResponse( - success=False, message=msg, num_executors=0 - ) - query_num = int(query_parts[2]) - if len(query_parts) == 5: - dataset_size = int(query_parts[3]) - max_executors_per_job = int(query_parts[4]) - else: - dataset_size = FLAGS.tpch_dataset_size - max_executors_per_job = FLAGS.tpch_max_executors_per_job + match query_parts: + case _, _, query_num, index, dataset_size, max_executors_per_job: + query_num = int(query_num) + dataset_size = int(dataset_size) + max_executors_per_job = int(max_executors_per_job) + case _, _, query_num, dataset_size, max_executors_per_job: + query_num = int(query_num) + # default index counts up from 0; incorrect if + # Spark receives jobs out of order + index = str(len(self._registered_applications)) + dataset_size = int(dataset_size) + max_executors_per_job = int(max_executors_per_job) + case _, _, query_num: + query_num = int(query_num) + index = str(len(self._registered_applications)) + dataset_size = FLAGS.tpch_dataset_size + max_executors_per_job = FLAGS.tpch_max_executors_per_job + case _: + msg = f"[{stime}] Invalid TPCH query request" + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message=msg, num_executors=0 + ) # Convert request.dependencies to [{name: int, children: [int]}] dependencies = [] @@ -423,11 +433,12 @@ async def RegisterTaskGraph(self, request, context): ) # Create a job graph + self._logger.debug(str((query_num, index, dataset_size, max_executors_per_job))) try: job_graph, stage_id_mapping = self._data_loaders[ DataLoader.TPCH ].make_job_graph( - id=request.id, + id=index, query_num=query_num, dependencies=dependencies, dataset_size=dataset_size, From 6a1528b62df11b08eab808a8e8dbd354830c36d7 Mon Sep 17 00:00:00 2001 From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Tue, 21 Jan 2025 13:41:28 -0500 Subject: [PATCH 124/128] Deterministic task graph deadlines based on task graph names --- workload/jobs.py | 18 ++++++++++-------- 
1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/workload/jobs.py b/workload/jobs.py index 0e77be69..97b4966a 100644 --- a/workload/jobs.py +++ b/workload/jobs.py @@ -200,12 +200,6 @@ class JobGraph(Graph[Job]): "fixed_gamma", ) - # !HACK! This random number generator is used exclusively to - # calculate deadlines. This ensures that deadlines are consistent - # across runs of the simulator in different contexts where RNGs - # are called in different orders. - _deadline_rng = random.Random(42) - class ReleasePolicyType(Enum): """Represents the different release policies supported by a JobGraph.""" @@ -811,6 +805,14 @@ def _generate_task_graph( resolve_conditionals = False task_logger = setup_logging(name="Task") + # Create an RNG to be used when fuzzing deadlines, seeded by + # the TaskGraph name and the global random seed, if provided. + # This ensures that deadlines are deterministic, which is + # needed for simulator/Spark parity. + deadline_rng = random.Random( + (str(_flags.random_seed) if _flags else "") + task_graph_name + ) + # Generate the deadline for all the Tasks. # TODO (Sukrit): Right now, this assumes that all Tasks in the TaskGraph come # with the same deadline. At some point, we will have to implement a @@ -819,7 +821,7 @@ def _generate_task_graph( # NOTE: The taskgraph deadline is re-generated (and overwritten) after # use_branch_predicated_deadlines code, since fuzz is invoked again there. task_deadline = release_time + self.completion_time.fuzz( - deadline_variance, deadline_bounds, rng=type(self)._deadline_rng + deadline_variance, deadline_bounds, rng=deadline_rng ) # Generate all the `Task`s from the `Job`s in the graph. @@ -895,7 +897,7 @@ def _generate_task_graph( # NOTE: This is the second time the deadline is being set, based on a second # invocation of fuzz. 
task_graph_deadline = release_time + weighted_task_graph_length.fuzz( - deadline_variance, deadline_bounds, rng=type(self)._deadline_rng + deadline_variance, deadline_bounds, rng=deadline_rng ) if _flags and _flags.decompose_deadlines: stages_info = {} From 8fe4956146cd7c3bfdcc154025922f8c9af40e60 Mon Sep 17 00:00:00 2001 From: Rohan Bafna <130247393+rohanbafna@users.noreply.github.com> Date: Tue, 28 Jan 2025 20:42:29 -0500 Subject: [PATCH 125/128] TetriSchedScheduler: Reconsider tasks that couldn't be placed To avoid needless reruns, TetriSchedScheduler does not run the scheduler if there are no tasks which are not scheduled, not part of a task graph that has been previously considered, and not part of a task graph that has been cancelled. We remove this second condition to account for situations in which a task graph is considered and its tasks scheduled, but the tasks failed to be placed (for instance, if another task on the same worker finished late, taking up resources). In such cases, the task graph would not be cancelled and might still be able to be completed, so we need to run the scheduler again to try to schedule the tasks that could not be placed before. --- schedulers/tetrisched_scheduler.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/schedulers/tetrisched_scheduler.py b/schedulers/tetrisched_scheduler.py index 6cbeb425..3198faed 100644 --- a/schedulers/tetrisched_scheduler.py +++ b/schedulers/tetrisched_scheduler.py @@ -601,11 +601,9 @@ def schedule( # Construct the STRL expression. scheduler_start_time = time.time() if len(tasks_to_be_scheduled) > 0 and any( - # If there is a Task belonging to a TaskGraph that hasn't been previously - # considered for scheduling and belongs to a TaskGraph that hasn't been - # cancelled, then we run the scheduler. + # If there is a Task belonging to a TaskGraph that hasn't + # been cancelled, then we run the scheduler. 
task.state != TaskState.SCHEDULED - and task.task_graph not in self._previously_considered_task_graphs and task.task_graph not in cancelled_task_graphs for task in tasks_to_be_scheduled ): From f73f4cd46687d1969c65e3dc937ebd2b1f8cd57c Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Wed, 12 Feb 2025 17:15:09 -0500 Subject: [PATCH 126/128] tpch partitioning based on space-time analysis --- data/tpch_partitioning_analysis.ipynb | 1259 +++++++++++++++++++++++++ 1 file changed, 1259 insertions(+) create mode 100644 data/tpch_partitioning_analysis.ipynb diff --git a/data/tpch_partitioning_analysis.ipynb b/data/tpch_partitioning_analysis.ipynb new file mode 100644 index 00000000..b5ff4c07 --- /dev/null +++ b/data/tpch_partitioning_analysis.ipynb @@ -0,0 +1,1259 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"q1\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 18\n", + " ],\n", + " [\n", + " 1,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 1,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q2\": [\n", + " [\n", + " 0,\n", + " 100,\n", + " 14\n", + " ],\n", + " [\n", + " 1,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 146,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 35,\n", + " 12\n", + " ],\n", + " [\n", + " 6,\n", + " 35,\n", + " 12\n", + " ],\n", + " [\n", + " 7,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 8,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 9,\n", + " 2,\n", + " 12\n", + " ],\n", + " [\n", + " 10,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 11,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 12,\n", + " 13,\n", + " 12\n", + " ],\n", + " [\n", + " 13,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 14,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 
15,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 16,\n", + " 1,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q3\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 18\n", + " ],\n", + " [\n", + " 1,\n", + " 133,\n", + " 17\n", + " ],\n", + " [\n", + " 2,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 139,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 139,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q4\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 17\n", + " ],\n", + " [\n", + " 1,\n", + " 133,\n", + " 14\n", + " ],\n", + " [\n", + " 2,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 141,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 1,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q5\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 19\n", + " ],\n", + " [\n", + " 1,\n", + " 133,\n", + " 13\n", + " ],\n", + " [\n", + " 2,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 200,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 138,\n", + " 12\n", + " ],\n", + " [\n", + " 6,\n", + " 35,\n", + " 12\n", + " ],\n", + " [\n", + " 7,\n", + " 140,\n", + " 12\n", + " ],\n", + " [\n", + " 8,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 9,\n", + " 2,\n", + " 12\n", + " ],\n", + " [\n", + " 10,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 11,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 12,\n", + " 1,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q6\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 14\n", + " ],\n", + " [\n", + " 1,\n", + " 1,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q7\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 15\n", + " ],\n", + " [\n", + " 1,\n", + " 133,\n", + " 14\n", + " ],\n", + " [\n", + " 2,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 140,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 137,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 149,\n", + " 12\n", + " ],\n", + " [\n", + " 6,\n", + " 1,\n", + " 12\n", 
+ " ],\n", + " [\n", + " 7,\n", + " 35,\n", + " 12\n", + " ],\n", + " [\n", + " 8,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 9,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 10,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 11,\n", + " 1,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q8\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 20\n", + " ],\n", + " [\n", + " 1,\n", + " 133,\n", + " 15\n", + " ],\n", + " [\n", + " 2,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 200,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 6,\n", + " 140,\n", + " 12\n", + " ],\n", + " [\n", + " 7,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 8,\n", + " 35,\n", + " 12\n", + " ],\n", + " [\n", + " 9,\n", + " 146,\n", + " 12\n", + " ],\n", + " [\n", + " 10,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 11,\n", + " 66,\n", + " 12\n", + " ],\n", + " [\n", + " 12,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 13,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 14,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 15,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 16,\n", + " 1,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q9\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 18\n", + " ],\n", + " [\n", + " 1,\n", + " 100,\n", + " 17\n", + " ],\n", + " [\n", + " 2,\n", + " 133,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 200,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 138,\n", + " 12\n", + " ],\n", + " [\n", + " 6,\n", + " 143,\n", + " 12\n", + " ],\n", + " [\n", + " 7,\n", + " 35,\n", + " 12\n", + " ],\n", + " [\n", + " 8,\n", + " 144,\n", + " 12\n", + " ],\n", + " [\n", + " 9,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 10,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 11,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 12,\n", + " 1,\n", + " 12\n", + " 
]\n", + " ],\n", + " \"q10\": [\n", + " [\n", + " 0,\n", + " 133,\n", + " 13\n", + " ],\n", + " [\n", + " 1,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 141,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 137,\n", + " 12\n", + " ],\n", + " [\n", + " 6,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 7,\n", + " 144,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q11\": [\n", + " [\n", + " 0,\n", + " 100,\n", + " 16\n", + " ],\n", + " [\n", + " 1,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 35,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 40,\n", + " 12\n", + " ],\n", + " [\n", + " 6,\n", + " 1,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q12\": [\n", + " [\n", + " 0,\n", + " 133,\n", + " 12\n", + " ],\n", + " [\n", + " 1,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 162,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 1,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q13\": [\n", + " [\n", + " 0,\n", + " 133,\n", + " 12\n", + " ],\n", + " [\n", + " 1,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 149,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 66,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 1,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q14\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 1,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 142,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q15\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 1,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 139,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 35,\n", + " 12\n", + " 
],\n", + " [\n", + " 4,\n", + " 138,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 138,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q16\": [\n", + " [\n", + " 0,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 1,\n", + " 35,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 112,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 139,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 6,\n", + " 143,\n", + " 12\n", + " ],\n", + " [\n", + " 7,\n", + " 66,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q17\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 1,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 94,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 200,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q18\": [\n", + " [\n", + " 0,\n", + " 133,\n", + " 16\n", + " ],\n", + " [\n", + " 1,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 200,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 100,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q19\": [\n", + " [\n", + " 0,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 1,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 1,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q20\": [\n", + " [\n", + " 0,\n", + " 100,\n", + " 14\n", + " ],\n", + " [\n", + " 1,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 140,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 6,\n", + " 2,\n", + " 12\n", + " ],\n", + " [\n", + " 7,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 8,\n", + " 5,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q21\": [\n", + " [\n", + " 0,\n", + " 133,\n", + " 
12\n", + " ],\n", + " [\n", + " 1,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 593,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 137,\n", + " 12\n", + " ],\n", + " [\n", + " 5,\n", + " 144,\n", + " 12\n", + " ],\n", + " [\n", + " 6,\n", + " 140,\n", + " 12\n", + " ],\n", + " [\n", + " 7,\n", + " 199,\n", + " 12\n", + " ],\n", + " [\n", + " 8,\n", + " 126,\n", + " 12\n", + " ],\n", + " [\n", + " 9,\n", + " 143,\n", + " 12\n", + " ],\n", + " [\n", + " 10,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 11,\n", + " 1,\n", + " 12\n", + " ],\n", + " [\n", + " 12,\n", + " 35,\n", + " 12\n", + " ],\n", + " [\n", + " 13,\n", + " 6,\n", + " 12\n", + " ]\n", + " ],\n", + " \"q22\": [\n", + " [\n", + " 0,\n", + " 133,\n", + " 14\n", + " ],\n", + " [\n", + " 1,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 2,\n", + " 142,\n", + " 12\n", + " ],\n", + " [\n", + " 3,\n", + " 100,\n", + " 12\n", + " ],\n", + " [\n", + " 4,\n", + " 1,\n", + " 12\n", + " ]\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "import json\n", + "import math\n", + "\n", + "def load_json(file_path):\n", + " with open(file_path, \"r\") as f:\n", + " return json.load(f)\n", + "\n", + "# NOTE: Done for only 100g dataset and maxCores 100\n", + "# TODO: Check if the buckets hold for other configs we want to use:\n", + "# maxCores=200? or maxCores=75?\n", + "\n", + "# NOTE: Also hardcodes min_task_runtime to 12s. But need to verify the\n", + "# task_runtime computed here is same as loader. 
Loader also does rectangular\n", + "# reshaping of tasks to stage runtime.\n", + "\n", + "def extract_tpch_data(json_data):\n", + " extracted_data = {}\n", + " \n", + " for query_key, stages in json_data.items():\n", + " if \"100g\" in query_key and \"maxCores_100\" in query_key:\n", + " query_id = query_key.split(\"_\")[1] # Extract query number (e.g., 'q1')\n", + " extracted_data[query_id] = [\n", + " (stage[\"stage_id\"], stage[\"num_tasks\"], math.ceil(max(12000, int(stage[\"average_runtime_ms\"]))/1000))\n", + " for stage in stages\n", + " ]\n", + " \n", + " return extracted_data\n", + "\n", + "if __name__ == \"__main__\":\n", + " file_path = \"/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/cloudlab/cloudlab_22query_tpch_profiles.json\"\n", + " json_data = load_json(file_path)\n", + " result = extract_tpch_data(json_data)\n", + " print(json.dumps(result, indent=4))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'q1': [(0, 593, 18), (1, 1, 12), (2, 1, 12)],\n", + " 'q2': [(0, 100, 14),\n", + " (1, 100, 12),\n", + " (2, 100, 12),\n", + " (3, 100, 12),\n", + " (4, 146, 12),\n", + " (5, 35, 12),\n", + " (6, 35, 12),\n", + " (7, 1, 12),\n", + " (8, 1, 12),\n", + " (9, 2, 12),\n", + " (10, 1, 12),\n", + " (11, 1, 12),\n", + " (12, 13, 12),\n", + " (13, 1, 12),\n", + " (14, 1, 12),\n", + " (15, 1, 12),\n", + " (16, 1, 12)],\n", + " 'q3': [(0, 593, 18), (1, 133, 17), (2, 100, 12), (3, 139, 12), (4, 139, 12)],\n", + " 'q4': [(0, 593, 17), (1, 133, 14), (2, 1, 12), (3, 141, 12), (4, 1, 12)],\n", + " 'q5': [(0, 593, 19),\n", + " (1, 133, 13),\n", + " (2, 1, 12),\n", + " (3, 200, 12),\n", + " (4, 100, 12),\n", + " (5, 138, 12),\n", + " (6, 35, 12),\n", + " (7, 140, 12),\n", + " (8, 1, 12),\n", + " (9, 2, 12),\n", + " (10, 1, 12),\n", + " (11, 1, 12),\n", + " (12, 1, 12)],\n", + " 'q6': [(0, 593, 14), (1, 1, 12)],\n", + " 'q7': [(0, 593, 15),\n", + " (1, 133, 14),\n", + " 
(2, 100, 12),\n", + " (3, 140, 12),\n", + " (4, 137, 12),\n", + " (5, 149, 12),\n", + " (6, 1, 12),\n", + " (7, 35, 12),\n", + " (8, 1, 12),\n", + " (9, 1, 12),\n", + " (10, 1, 12),\n", + " (11, 1, 12)],\n", + " 'q8': [(0, 593, 20),\n", + " (1, 133, 15),\n", + " (2, 1, 12),\n", + " (3, 100, 12),\n", + " (4, 200, 12),\n", + " (5, 100, 12),\n", + " (6, 140, 12),\n", + " (7, 1, 12),\n", + " (8, 35, 12),\n", + " (9, 146, 12),\n", + " (10, 1, 12),\n", + " (11, 66, 12),\n", + " (12, 1, 12),\n", + " (13, 1, 12),\n", + " (14, 1, 12),\n", + " (15, 1, 12),\n", + " (16, 1, 12)],\n", + " 'q9': [(0, 593, 18),\n", + " (1, 100, 17),\n", + " (2, 133, 12),\n", + " (3, 100, 12),\n", + " (4, 200, 12),\n", + " (5, 138, 12),\n", + " (6, 143, 12),\n", + " (7, 35, 12),\n", + " (8, 144, 12),\n", + " (9, 1, 12),\n", + " (10, 1, 12),\n", + " (11, 1, 12),\n", + " (12, 1, 12)],\n", + " 'q10': [(0, 133, 13),\n", + " (1, 593, 12),\n", + " (2, 100, 12),\n", + " (3, 141, 12),\n", + " (4, 1, 12),\n", + " (5, 137, 12),\n", + " (6, 1, 12),\n", + " (7, 144, 12)],\n", + " 'q11': [(0, 100, 16),\n", + " (1, 100, 12),\n", + " (2, 1, 12),\n", + " (3, 100, 12),\n", + " (4, 35, 12),\n", + " (5, 40, 12),\n", + " (6, 1, 12)],\n", + " 'q12': [(0, 133, 12), (1, 593, 12), (2, 162, 12), (3, 1, 12), (4, 1, 12)],\n", + " 'q13': [(0, 133, 12),\n", + " (1, 100, 12),\n", + " (2, 149, 12),\n", + " (3, 66, 12),\n", + " (4, 1, 12),\n", + " (5, 1, 12)],\n", + " 'q14': [(0, 593, 12), (1, 100, 12), (2, 142, 12)],\n", + " 'q15': [(0, 593, 12),\n", + " (1, 1, 12),\n", + " (2, 139, 12),\n", + " (3, 35, 12),\n", + " (4, 138, 12),\n", + " (5, 138, 12)],\n", + " 'q16': [(0, 100, 12),\n", + " (1, 35, 12),\n", + " (2, 100, 12),\n", + " (3, 112, 12),\n", + " (4, 139, 12),\n", + " (5, 1, 12),\n", + " (6, 143, 12),\n", + " (7, 66, 12)],\n", + " 'q17': [(0, 593, 12), (1, 593, 12), (2, 94, 12), (3, 200, 12)],\n", + " 'q18': [(0, 133, 16),\n", + " (1, 593, 12),\n", + " (2, 593, 12),\n", + " (3, 200, 12),\n", + " (4, 100, 12),\n", + " (5, 
100, 12)],\n", + " 'q19': [(0, 593, 12), (1, 100, 12), (2, 1, 12)],\n", + " 'q20': [(0, 100, 14),\n", + " (1, 593, 12),\n", + " (2, 100, 12),\n", + " (3, 100, 12),\n", + " (4, 140, 12),\n", + " (5, 1, 12),\n", + " (6, 2, 12),\n", + " (7, 1, 12),\n", + " (8, 5, 12)],\n", + " 'q21': [(0, 133, 12),\n", + " (1, 593, 12),\n", + " (2, 593, 12),\n", + " (3, 593, 12),\n", + " (4, 137, 12),\n", + " (5, 144, 12),\n", + " (6, 140, 12),\n", + " (7, 199, 12),\n", + " (8, 126, 12),\n", + " (9, 143, 12),\n", + " (10, 1, 12),\n", + " (11, 1, 12),\n", + " (12, 35, 12),\n", + " (13, 6, 12)],\n", + " 'q22': [(0, 133, 14), (1, 100, 12), (2, 142, 12), (3, 100, 12), (4, 1, 12)]}" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_resource_space(data):\n", + " resource_space = {}\n", + " for query_id, stages in data.items():\n", + " resource_space[query_id] = sum(num_tasks * runtime for _, num_tasks, runtime in stages)\n", + " return resource_space" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "query_resource_requirements = compute_resource_space(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'q1': 10698,\n", + " 'q2': 7868,\n", + " 'q3': 17471,\n", + " 'q4': 13659,\n", + " 'q5': 20436,\n", + " 'q6': 8314,\n", + " 'q7': 17549,\n", + " 'q8': 23395,\n", + " 'q9': 23138,\n", + " 'q10': 15133,\n", + " 'q11': 4924,\n", + " 'q12': 10680,\n", + " 'q13': 5400,\n", + " 'q14': 10020,\n", + " 'q15': 12528,\n", + " 'q16': 8352,\n", + " 'q17': 17760,\n", + " 'q18': 21160,\n", + " 'q19': 8328,\n", + " 'q20': 12704,\n", + " 'q21': 34128,\n", + " 'q22': 5978}" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "query_resource_requirements" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "def bucketize_queries(resource_space):\n", + " buckets = {\"easy\": [], \"medium\": [], \"hard\": []}\n", + " for query_id, value in resource_space.items():\n", + " if value < 10000:\n", + " buckets[\"easy\"].append(query_id)\n", + " elif 10000 <= value <= 20000:\n", + " buckets[\"medium\"].append(query_id)\n", + " else:\n", + " buckets[\"hard\"].append(query_id)\n", + " return buckets" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "buckets = bucketize_queries(query_resource_requirements)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'easy': ['q2', 'q6', 'q11', 'q13', 'q16', 'q19', 'q22'],\n", + " 'medium': ['q1', 'q3', 'q4', 'q7', 'q10', 'q12', 'q14', 'q15', 'q17', 'q20'],\n", + " 'hard': ['q5', 'q8', 'q9', 'q18', 'q21']}" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "buckets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Retry with rectangular mapping?" 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dg_erdos", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From db90d0c37a9fc560f34ce8b05b367ae7feb667ba Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 20 Feb 2025 11:19:53 -0500 Subject: [PATCH 127/128] bucketize space-time for tpch-partitioning, cleaned up code --- data/tpch_partitioning_analysis.ipynb | 1319 ++++--------------------- 1 file changed, 175 insertions(+), 1144 deletions(-) diff --git a/data/tpch_partitioning_analysis.ipynb b/data/tpch_partitioning_analysis.ipynb index b5ff4c07..0e3d1f7d 100644 --- a/data/tpch_partitioning_analysis.ipynb +++ b/data/tpch_partitioning_analysis.ipynb @@ -2,1237 +2,268 @@ "cells": [ { "cell_type": "code", - "execution_count": 26, + "execution_count": 10, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"q1\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 18\n", - " ],\n", - " [\n", - " 1,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q2\": [\n", - " [\n", - " 0,\n", - " 100,\n", - " 14\n", - " ],\n", - " [\n", - " 1,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 146,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 35,\n", - " 12\n", - " ],\n", - " [\n", - " 6,\n", - " 35,\n", - " 12\n", - " ],\n", - " [\n", - " 7,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 8,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 9,\n", - " 2,\n", - " 12\n", - " ],\n", - " [\n", - " 10,\n", - " 1,\n", - " 12\n", - " 
],\n", - " [\n", - " 11,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 12,\n", - " 13,\n", - " 12\n", - " ],\n", - " [\n", - " 13,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 14,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 15,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 16,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q3\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 18\n", - " ],\n", - " [\n", - " 1,\n", - " 133,\n", - " 17\n", - " ],\n", - " [\n", - " 2,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 139,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 139,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q4\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 17\n", - " ],\n", - " [\n", - " 1,\n", - " 133,\n", - " 14\n", - " ],\n", - " [\n", - " 2,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 141,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q5\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 19\n", - " ],\n", - " [\n", - " 1,\n", - " 133,\n", - " 13\n", - " ],\n", - " [\n", - " 2,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 200,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 138,\n", - " 12\n", - " ],\n", - " [\n", - " 6,\n", - " 35,\n", - " 12\n", - " ],\n", - " [\n", - " 7,\n", - " 140,\n", - " 12\n", - " ],\n", - " [\n", - " 8,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 9,\n", - " 2,\n", - " 12\n", - " ],\n", - " [\n", - " 10,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 11,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 12,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q6\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 14\n", - " ],\n", - " [\n", - " 1,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q7\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 15\n", - " ],\n", - " [\n", - " 1,\n", - " 133,\n", - " 14\n", - " ],\n", - " [\n", - " 2,\n", - " 
100,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 140,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 137,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 149,\n", - " 12\n", - " ],\n", - " [\n", - " 6,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 7,\n", - " 35,\n", - " 12\n", - " ],\n", - " [\n", - " 8,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 9,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 10,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 11,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q8\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 20\n", - " ],\n", - " [\n", - " 1,\n", - " 133,\n", - " 15\n", - " ],\n", - " [\n", - " 2,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 200,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 6,\n", - " 140,\n", - " 12\n", - " ],\n", - " [\n", - " 7,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 8,\n", - " 35,\n", - " 12\n", - " ],\n", - " [\n", - " 9,\n", - " 146,\n", - " 12\n", - " ],\n", - " [\n", - " 10,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 11,\n", - " 66,\n", - " 12\n", - " ],\n", - " [\n", - " 12,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 13,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 14,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 15,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 16,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q9\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 18\n", - " ],\n", - " [\n", - " 1,\n", - " 100,\n", - " 17\n", - " ],\n", - " [\n", - " 2,\n", - " 133,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 200,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 138,\n", - " 12\n", - " ],\n", - " [\n", - " 6,\n", - " 143,\n", - " 12\n", - " ],\n", - " [\n", - " 7,\n", - " 35,\n", - " 12\n", - " ],\n", - " [\n", - " 8,\n", - " 
144,\n", - " 12\n", - " ],\n", - " [\n", - " 9,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 10,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 11,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 12,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q10\": [\n", - " [\n", - " 0,\n", - " 133,\n", - " 13\n", - " ],\n", - " [\n", - " 1,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 141,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 137,\n", - " 12\n", - " ],\n", - " [\n", - " 6,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 7,\n", - " 144,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q11\": [\n", - " [\n", - " 0,\n", - " 100,\n", - " 16\n", - " ],\n", - " [\n", - " 1,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 35,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 40,\n", - " 12\n", - " ],\n", - " [\n", - " 6,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q12\": [\n", - " [\n", - " 0,\n", - " 133,\n", - " 12\n", - " ],\n", - " [\n", - " 1,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 162,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q13\": [\n", - " [\n", - " 0,\n", - " 133,\n", - " 12\n", - " ],\n", - " [\n", - " 1,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 149,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 66,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q14\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 1,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 142,\n", - " 12\n", - " ]\n", 
- " ],\n", - " \"q15\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 1,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 139,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 35,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 138,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 138,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q16\": [\n", - " [\n", - " 0,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 1,\n", - " 35,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 112,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 139,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 6,\n", - " 143,\n", - " 12\n", - " ],\n", - " [\n", - " 7,\n", - " 66,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q17\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 1,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 94,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 200,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q18\": [\n", - " [\n", - " 0,\n", - " 133,\n", - " 16\n", - " ],\n", - " [\n", - " 1,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 200,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 100,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q19\": [\n", - " [\n", - " 0,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 1,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 1,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q20\": [\n", - " [\n", - " 0,\n", - " 100,\n", - " 14\n", - " ],\n", - " [\n", - " 1,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 140,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 1,\n", - " 12\n", 
- " ],\n", - " [\n", - " 6,\n", - " 2,\n", - " 12\n", - " ],\n", - " [\n", - " 7,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 8,\n", - " 5,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q21\": [\n", - " [\n", - " 0,\n", - " 133,\n", - " 12\n", - " ],\n", - " [\n", - " 1,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 593,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 137,\n", - " 12\n", - " ],\n", - " [\n", - " 5,\n", - " 144,\n", - " 12\n", - " ],\n", - " [\n", - " 6,\n", - " 140,\n", - " 12\n", - " ],\n", - " [\n", - " 7,\n", - " 199,\n", - " 12\n", - " ],\n", - " [\n", - " 8,\n", - " 126,\n", - " 12\n", - " ],\n", - " [\n", - " 9,\n", - " 143,\n", - " 12\n", - " ],\n", - " [\n", - " 10,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 11,\n", - " 1,\n", - " 12\n", - " ],\n", - " [\n", - " 12,\n", - " 35,\n", - " 12\n", - " ],\n", - " [\n", - " 13,\n", - " 6,\n", - " 12\n", - " ]\n", - " ],\n", - " \"q22\": [\n", - " [\n", - " 0,\n", - " 133,\n", - " 14\n", - " ],\n", - " [\n", - " 1,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 2,\n", - " 142,\n", - " 12\n", - " ],\n", - " [\n", - " 3,\n", - " 100,\n", - " 12\n", - " ],\n", - " [\n", - " 4,\n", - " 1,\n", - " 12\n", - " ]\n", - " ]\n", - "}\n" - ] - } - ], + "outputs": [], "source": [ "import json\n", - "import math\n", - "\n", + "import math" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ "def load_json(file_path):\n", + " \"\"\"Load JSON data from a file.\"\"\"\n", " with open(file_path, \"r\") as f:\n", - " return json.load(f)\n", + " return json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def recompute_tasks_runtime(stage_id, num_tasks, avg_runtime_ms, max_executors, min_task_runtime_ms):\n", + " \"\"\"Recompute the number of tasks and runtime per task while enforcing 
constraints.\"\"\"\n", + " profiled_runtime = math.ceil(avg_runtime_ms / 1000) # Convert ms to seconds\n", "\n", - "# NOTE: Done for only 100g dataset and maxCores 100\n", - "# TODO: Check if the buckets hold for other configs we want to use:\n", - "# maxCores=200? or maxCores=75?\n", + " if num_tasks > max_executors:\n", + " adjusted_runtime = math.ceil((num_tasks * profiled_runtime) / max_executors)\n", + " adjusted_num_tasks = max_executors\n", + " else:\n", + " adjusted_runtime = profiled_runtime\n", + " adjusted_num_tasks = num_tasks\n", "\n", - "# NOTE: Also hardcodes min_task_runtime to 12s. But need to verify the\n", - "# task_runtime computed here is same as loader. Loader also does rectangular\n", - "# reshaping of tasks to stage runtime.\n", + " final_runtime = max(math.ceil(min_task_runtime_ms / 1000), adjusted_runtime) # Enforce min runtime\n", "\n", - "def extract_tpch_data(json_data):\n", + " # print(\n", + " # f\"Stage {stage_id}: num_tasks ({num_tasks}) -> {adjusted_num_tasks}, \"\n", + " # f\"runtime_s ({profiled_runtime}) -> {final_runtime}\"\n", + " # )\n", + "\n", + " return adjusted_num_tasks, final_runtime" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_tpch_data(json_data, dataset_size, max_executors, min_task_runtime_ms):\n", + " \"\"\"Extract TPCH query data, filter relevant queries, and recompute runtime.\"\"\"\n", " extracted_data = {}\n", - " \n", + "\n", " for query_key, stages in json_data.items():\n", - " if \"100g\" in query_key and \"maxCores_100\" in query_key:\n", + " if dataset_size in query_key and \"maxCores_\" + str(max_executors) in query_key:\n", " query_id = query_key.split(\"_\")[1] # Extract query number (e.g., 'q1')\n", + " # print(f\"--------Processing query {query_id}\")\n", " extracted_data[query_id] = [\n", - " (stage[\"stage_id\"], stage[\"num_tasks\"], math.ceil(max(12000, int(stage[\"average_runtime_ms\"]))/1000))\n", + " 
(stage[\"stage_id\"], recompute_tasks_runtime(stage[\"stage_id\"], stage[\"num_tasks\"], int(stage[\"average_runtime_ms\"]), max_executors, min_task_runtime_ms))\n", " for stage in stages\n", " ]\n", - " \n", - " return extracted_data\n", "\n", - "if __name__ == \"__main__\":\n", - " file_path = \"/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/cloudlab/cloudlab_22query_tpch_profiles.json\"\n", - " json_data = load_json(file_path)\n", - " result = extract_tpch_data(json_data)\n", - " print(json.dumps(result, indent=4))" + " return extracted_data" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 14, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'q1': [(0, 593, 18), (1, 1, 12), (2, 1, 12)],\n", - " 'q2': [(0, 100, 14),\n", - " (1, 100, 12),\n", - " (2, 100, 12),\n", - " (3, 100, 12),\n", - " (4, 146, 12),\n", - " (5, 35, 12),\n", - " (6, 35, 12),\n", - " (7, 1, 12),\n", - " (8, 1, 12),\n", - " (9, 2, 12),\n", - " (10, 1, 12),\n", - " (11, 1, 12),\n", - " (12, 13, 12),\n", - " (13, 1, 12),\n", - " (14, 1, 12),\n", - " (15, 1, 12),\n", - " (16, 1, 12)],\n", - " 'q3': [(0, 593, 18), (1, 133, 17), (2, 100, 12), (3, 139, 12), (4, 139, 12)],\n", - " 'q4': [(0, 593, 17), (1, 133, 14), (2, 1, 12), (3, 141, 12), (4, 1, 12)],\n", - " 'q5': [(0, 593, 19),\n", - " (1, 133, 13),\n", - " (2, 1, 12),\n", - " (3, 200, 12),\n", - " (4, 100, 12),\n", - " (5, 138, 12),\n", - " (6, 35, 12),\n", - " (7, 140, 12),\n", - " (8, 1, 12),\n", - " (9, 2, 12),\n", - " (10, 1, 12),\n", - " (11, 1, 12),\n", - " (12, 1, 12)],\n", - " 'q6': [(0, 593, 14), (1, 1, 12)],\n", - " 'q7': [(0, 593, 15),\n", - " (1, 133, 14),\n", - " (2, 100, 12),\n", - " (3, 140, 12),\n", - " (4, 137, 12),\n", - " (5, 149, 12),\n", - " (6, 1, 12),\n", - " (7, 35, 12),\n", - " (8, 1, 12),\n", - " (9, 1, 12),\n", - " (10, 1, 12),\n", - " (11, 1, 12)],\n", - " 'q8': [(0, 593, 20),\n", - " (1, 133, 15),\n", - " (2, 1, 12),\n", - " (3, 100, 12),\n", - " (4, 200, 
12),\n", - " (5, 100, 12),\n", - " (6, 140, 12),\n", - " (7, 1, 12),\n", - " (8, 35, 12),\n", - " (9, 146, 12),\n", - " (10, 1, 12),\n", - " (11, 66, 12),\n", - " (12, 1, 12),\n", - " (13, 1, 12),\n", - " (14, 1, 12),\n", - " (15, 1, 12),\n", - " (16, 1, 12)],\n", - " 'q9': [(0, 593, 18),\n", - " (1, 100, 17),\n", - " (2, 133, 12),\n", - " (3, 100, 12),\n", - " (4, 200, 12),\n", - " (5, 138, 12),\n", - " (6, 143, 12),\n", - " (7, 35, 12),\n", - " (8, 144, 12),\n", - " (9, 1, 12),\n", - " (10, 1, 12),\n", - " (11, 1, 12),\n", - " (12, 1, 12)],\n", - " 'q10': [(0, 133, 13),\n", - " (1, 593, 12),\n", - " (2, 100, 12),\n", - " (3, 141, 12),\n", - " (4, 1, 12),\n", - " (5, 137, 12),\n", - " (6, 1, 12),\n", - " (7, 144, 12)],\n", - " 'q11': [(0, 100, 16),\n", - " (1, 100, 12),\n", - " (2, 1, 12),\n", - " (3, 100, 12),\n", - " (4, 35, 12),\n", - " (5, 40, 12),\n", - " (6, 1, 12)],\n", - " 'q12': [(0, 133, 12), (1, 593, 12), (2, 162, 12), (3, 1, 12), (4, 1, 12)],\n", - " 'q13': [(0, 133, 12),\n", - " (1, 100, 12),\n", - " (2, 149, 12),\n", - " (3, 66, 12),\n", - " (4, 1, 12),\n", - " (5, 1, 12)],\n", - " 'q14': [(0, 593, 12), (1, 100, 12), (2, 142, 12)],\n", - " 'q15': [(0, 593, 12),\n", - " (1, 1, 12),\n", - " (2, 139, 12),\n", - " (3, 35, 12),\n", - " (4, 138, 12),\n", - " (5, 138, 12)],\n", - " 'q16': [(0, 100, 12),\n", - " (1, 35, 12),\n", - " (2, 100, 12),\n", - " (3, 112, 12),\n", - " (4, 139, 12),\n", - " (5, 1, 12),\n", - " (6, 143, 12),\n", - " (7, 66, 12)],\n", - " 'q17': [(0, 593, 12), (1, 593, 12), (2, 94, 12), (3, 200, 12)],\n", - " 'q18': [(0, 133, 16),\n", - " (1, 593, 12),\n", - " (2, 593, 12),\n", - " (3, 200, 12),\n", - " (4, 100, 12),\n", - " (5, 100, 12)],\n", - " 'q19': [(0, 593, 12), (1, 100, 12), (2, 1, 12)],\n", - " 'q20': [(0, 100, 14),\n", - " (1, 593, 12),\n", - " (2, 100, 12),\n", - " (3, 100, 12),\n", - " (4, 140, 12),\n", - " (5, 1, 12),\n", - " (6, 2, 12),\n", - " (7, 1, 12),\n", - " (8, 5, 12)],\n", - " 'q21': [(0, 133, 12),\n", - " (1, 593, 
12),\n", - " (2, 593, 12),\n", - " (3, 593, 12),\n", - " (4, 137, 12),\n", - " (5, 144, 12),\n", - " (6, 140, 12),\n", - " (7, 199, 12),\n", - " (8, 126, 12),\n", - " (9, 143, 12),\n", - " (10, 1, 12),\n", - " (11, 1, 12),\n", - " (12, 35, 12),\n", - " (13, 6, 12)],\n", - " 'q22': [(0, 133, 14), (1, 100, 12), (2, 142, 12), (3, 100, 12), (4, 1, 12)]}" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "result" + "def compute_resource_space(data):\n", + " \"\"\"Compute the total resource space required for each query.\"\"\"\n", + " return {query_id: sum(num_tasks * runtime for _, (num_tasks, runtime) in stages) for query_id, stages in data.items()}" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "def compute_resource_space(data):\n", - " resource_space = {}\n", - " for query_id, stages in data.items():\n", - " resource_space[query_id] = sum(num_tasks * runtime for _, num_tasks, runtime in stages)\n", - " return resource_space" + "def bucketize_queries(resource_space, bucket_size):\n", + " \"\"\"Classify queries into easy, medium, and hard buckets based on resource consumption.\"\"\"\n", + " buckets = {\"easy\": [], \"medium\": [], \"hard\": []}\n", + " for query_id, value in resource_space.items():\n", + " if value < bucket_size:\n", + " buckets[\"easy\"].append(query_id)\n", + " elif bucket_size <= value <= 2 * bucket_size:\n", + " buckets[\"medium\"].append(query_id)\n", + " else:\n", + " buckets[\"hard\"].append(query_id)\n", + " return buckets" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "query_resource_requirements = compute_resource_space(result)" + "def analyze_tpch_queries(json_path, bucket_size, dataset_size, max_executors, min_task_runtime_ms):\n", + " \"\"\"Main function to process TPCH queries and return categorized 
buckets.\"\"\"\n", + " json_data = load_json(json_path)\n", + " extracted_data = extract_tpch_data(json_data, dataset_size, max_executors, min_task_runtime_ms)\n", + " resource_requirements = compute_resource_space(extracted_data)\n", + " return bucketize_queries(resource_requirements, bucket_size)" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'q1': 10698,\n", - " 'q2': 7868,\n", - " 'q3': 17471,\n", - " 'q4': 13659,\n", - " 'q5': 20436,\n", - " 'q6': 8314,\n", - " 'q7': 17549,\n", - " 'q8': 23395,\n", - " 'q9': 23138,\n", - " 'q10': 15133,\n", - " 'q11': 4924,\n", - " 'q12': 10680,\n", - " 'q13': 5400,\n", - " 'q14': 10020,\n", - " 'q15': 12528,\n", - " 'q16': 8352,\n", - " 'q17': 17760,\n", - " 'q18': 21160,\n", - " 'q19': 8328,\n", - " 'q20': 12704,\n", - " 'q21': 34128,\n", - " 'q22': 5978}" + "{'easy': [],\n", + " 'medium': ['q1',\n", + " 'q2',\n", + " 'q6',\n", + " 'q11',\n", + " 'q12',\n", + " 'q13',\n", + " 'q14',\n", + " 'q15',\n", + " 'q16',\n", + " 'q19',\n", + " 'q20',\n", + " 'q22'],\n", + " 'hard': ['q3', 'q4', 'q5', 'q7', 'q8', 'q9', 'q10', 'q17', 'q18', 'q21']}" ] }, - "execution_count": 30, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "query_resource_requirements" + "# Input params to create buckets\n", + "json_path = \"/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/cloudlab/cloudlab_22query_tpch_profiles.json\"\n", + "bucket_size=8000\n", + "dataset_size=\"100g\"\n", + "max_executors=200\n", + "min_task_runtime_ms=12000\n", + "\n", + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=bucket_size, dataset_size=dataset_size, max_executors=max_executors, min_task_runtime_ms=min_task_runtime_ms)\n", + "buckets" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'easy': 
['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n", + " 'medium': ['q1', 'q4', 'q6', 'q10', 'q12', 'q14', 'q15', 'q17', 'q20'],\n", + " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q18', 'q21']}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def bucketize_queries(resource_space):\n", - " buckets = {\"easy\": [], \"medium\": [], \"hard\": []}\n", - " for query_id, value in resource_space.items():\n", - " if value < 10000:\n", - " buckets[\"easy\"].append(query_id)\n", - " elif 10000 <= value <= 20000:\n", - " buckets[\"medium\"].append(query_id)\n", - " else:\n", - " buckets[\"hard\"].append(query_id)\n", - " return buckets" + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=8000, dataset_size=\"100g\", max_executors=100, min_task_runtime_ms=12000)\n", + "buckets" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n", + " 'medium': ['q1', 'q4', 'q6', 'q10', 'q12', 'q14', 'q15', 'q20'],\n", + " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q17', 'q18', 'q21']}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "buckets = bucketize_queries(query_resource_requirements)" + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=7500, dataset_size=\"100g\", max_executors=100, min_task_runtime_ms=12000)\n", + "buckets" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'easy': ['q2', 'q6', 'q11', 'q13', 'q16', 'q19', 'q22'],\n", - " 'medium': ['q1', 'q3', 'q4', 'q7', 'q10', 'q12', 'q14', 'q15', 'q17', 'q20'],\n", - " 'hard': ['q5', 'q8', 'q9', 'q18', 'q21']}" + "{'easy': ['q13', 'q16', 'q22'],\n", + " 'medium': ['q2', 'q6', 'q11', 'q12', 'q14', 'q15', 'q19', 'q20'],\n", + " 'hard': ['q1',\n", + 
" 'q3',\n", + " 'q4',\n", + " 'q5',\n", + " 'q7',\n", + " 'q8',\n", + " 'q9',\n", + " 'q10',\n", + " 'q17',\n", + " 'q18',\n", + " 'q21']}" ] }, - "execution_count": 33, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=9000, dataset_size=\"250g\", max_executors=75, min_task_runtime_ms=12000)\n", "buckets" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n", + " 'medium': ['q1', 'q6', 'q7', 'q10', 'q12', 'q14', 'q15', 'q20'],\n", + " 'hard': ['q3', 'q4', 'q5', 'q8', 'q9', 'q17', 'q18', 'q21']}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Retry with rectangular mapping?" + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=11500, dataset_size=\"250g\", max_executors=75, min_task_runtime_ms=12000)\n", + "buckets" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From e4007266dbce9eaad4c948b30cf2ddbf0859c3fe Mon Sep 17 00:00:00 2001 From: Dhruv Garg Date: Thu, 20 Feb 2025 16:09:41 -0500 Subject: [PATCH 128/128] tpch partitions for 100g, 250g and diff max executors --- data/tpch_partitioning_analysis.ipynb | 143 +++++++++++++++----------- 1 file changed, 84 insertions(+), 59 deletions(-) diff --git a/data/tpch_partitioning_analysis.ipynb b/data/tpch_partitioning_analysis.ipynb index 0e3d1f7d..6b0eb160 100644 --- a/data/tpch_partitioning_analysis.ipynb +++ b/data/tpch_partitioning_analysis.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": {}, 
"outputs": [], "source": [ @@ -24,7 +24,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -103,7 +103,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -117,48 +117,53 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Input params to create buckets\n", + "json_path = \"/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/cloudlab/cloudlab_22query_tpch_profiles.json\"\n", + "bucket_size=8000\n", + "dataset_size=\"100g\"\n", + "max_executors=200\n", + "min_task_runtime_ms=12000\n", + "\n", + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=bucket_size, dataset_size=dataset_size, max_executors=max_executors, min_task_runtime_ms=min_task_runtime_ms)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 100g dataset, varying the executors: 75, 100, 200" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'easy': [],\n", - " 'medium': ['q1',\n", - " 'q2',\n", - " 'q6',\n", - " 'q11',\n", - " 'q12',\n", - " 'q13',\n", - " 'q14',\n", - " 'q15',\n", - " 'q16',\n", - " 'q19',\n", - " 'q20',\n", - " 'q22'],\n", - " 'hard': ['q3', 'q4', 'q5', 'q7', 'q8', 'q9', 'q10', 'q17', 'q18', 'q21']}" + "{'easy': ['q11', 'q13', 'q14', 'q15', 'q19', 'q20', 'q22'],\n", + " 'medium': ['q1', 'q2', 'q4', 'q6', 'q10', 'q12', 'q16', 'q17', 
'q18'],\n", + " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q21']}" ] }, - "execution_count": 17, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Input params to create buckets\n", - "json_path = \"/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/cloudlab/cloudlab_22query_tpch_profiles.json\"\n", - "bucket_size=8000\n", - "dataset_size=\"100g\"\n", - "max_executors=200\n", - "min_task_runtime_ms=12000\n", - "\n", - "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=bucket_size, dataset_size=dataset_size, max_executors=max_executors, min_task_runtime_ms=min_task_runtime_ms)\n", + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=5500, dataset_size=\"100g\", max_executors=75, min_task_runtime_ms=12000)\n", "buckets" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -169,7 +174,7 @@ " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q18', 'q21']}" ] }, - "execution_count": 18, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -181,80 +186,100 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n", - " 'medium': ['q1', 'q4', 'q6', 'q10', 'q12', 'q14', 'q15', 'q20'],\n", + "{'easy': ['q6', 'q11', 'q13', 'q19', 'q22'],\n", + " 'medium': ['q1', 'q2', 'q4', 'q10', 'q12', 'q14', 'q15', 'q16', 'q20'],\n", " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q17', 'q18', 'q21']}" ] }, - "execution_count": 19, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=7500, dataset_size=\"100g\", max_executors=100, min_task_runtime_ms=12000)\n", + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=10000, dataset_size=\"100g\", max_executors=200, min_task_runtime_ms=12000)\n", 
"buckets" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 250g dataset, varying the executors: 75, 100, 250" + ] + }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'easy': ['q13', 'q16', 'q22'],\n", - " 'medium': ['q2', 'q6', 'q11', 'q12', 'q14', 'q15', 'q19', 'q20'],\n", - " 'hard': ['q1',\n", - " 'q3',\n", - " 'q4',\n", - " 'q5',\n", - " 'q7',\n", - " 'q8',\n", - " 'q9',\n", - " 'q10',\n", - " 'q17',\n", - " 'q18',\n", - " 'q21']}" + "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n", + " 'medium': ['q1', 'q6', 'q7', 'q10', 'q12', 'q14', 'q15', 'q20'],\n", + " 'hard': ['q3', 'q4', 'q5', 'q8', 'q9', 'q17', 'q18', 'q21']}" ] }, - "execution_count": 20, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=9000, dataset_size=\"250g\", max_executors=75, min_task_runtime_ms=12000)\n", + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=11500, dataset_size=\"250g\", max_executors=75, min_task_runtime_ms=12000)\n", "buckets" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n", - " 'medium': ['q1', 'q6', 'q7', 'q10', 'q12', 'q14', 'q15', 'q20'],\n", - " 'hard': ['q3', 'q4', 'q5', 'q8', 'q9', 'q17', 'q18', 'q21']}" + " 'medium': ['q1', 'q6', 'q10', 'q12', 'q14', 'q15', 'q20'],\n", + " 'hard': ['q3', 'q4', 'q5', 'q7', 'q8', 'q9', 'q17', 'q18', 'q21']}" ] }, - "execution_count": 21, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=11500, dataset_size=\"250g\", max_executors=75, min_task_runtime_ms=12000)\n", + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=15000, 
dataset_size=\"250g\", max_executors=100, min_task_runtime_ms=12000)\n", + "buckets" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'easy': ['q1', 'q2', 'q6', 'q11', 'q13', 'q16', 'q22'],\n", + " 'medium': ['q4', 'q7', 'q10', 'q12', 'q14', 'q15', 'q19', 'q20'],\n", + " 'hard': ['q3', 'q5', 'q8', 'q9', 'q17', 'q18', 'q21']}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=15000, dataset_size=\"250g\", max_executors=200, min_task_runtime_ms=12000)\n", "buckets" ] },