diff --git a/analyze.py b/analyze.py index a2219981..97933fbd 100644 --- a/analyze.py +++ b/analyze.py @@ -335,7 +335,7 @@ def analyze_resource_utilization( # Plotting defaults. # hatches = ['//', '--', '**'] # alphas = np.arange(0.2, 1.2, 0.2) - resource_color = {"GPU": "red", "CPU": "green"} + resource_color = {"Slot": "green"} # Worker Pool statistics worker_pool_stats = csv_reader.get_worker_pool_utilizations(scheduler_csv_file) @@ -1246,16 +1246,16 @@ def log_aggregate_stats( / sum(stat.resource_utilizations[resource]) for stat in worker_pool_stats ] - for resource in ("GPU", "CPU") + for resource in ("Slot",) } scheduler_invocations = csv_reader.get_scheduler_invocations(csv_file) placed_tasks = [ - scheduler_invocation.placed_tasks + scheduler_invocation.num_placed_tasks for scheduler_invocation in scheduler_invocations ] unplaced_tasks = [ - scheduler_invocation.unplaced_tasks + scheduler_invocation.num_unplaced_tasks for scheduler_invocation in scheduler_invocations ] @@ -1268,8 +1268,7 @@ def log_aggregate_stats( placement_delay, deadline_delay, stat_function(e2e_response_time), - stat_function(resource_uses["GPU"]), - stat_function(resource_uses["CPU"]), + stat_function(resource_uses["Slot"]), stat_function(placed_tasks), stat_function(unplaced_tasks), log_name, @@ -1288,8 +1287,7 @@ def log_aggregate_stats( "Placement", "Deadline", "JCT", - "GPU", - "CPU", + "Slot", "Placed", "Unplaced", "Log", diff --git a/configs/tpch_replay_dsched.conf b/configs/tpch_replay_dsched.conf new file mode 100644 index 00000000..1b839546 --- /dev/null +++ b/configs/tpch_replay_dsched.conf @@ -0,0 +1,38 @@ +# Output configs. +--log=./tpch_replay_dsched.log +--log_level=debug +--csv=./tpch_replay_dsched.csv + +# Task configs. +--runtime_variance=0 + +# Scheduler configs. 
+ +# DSched +--scheduler=TetriSched +--scheduler_runtime=0 +--enforce_deadlines +--retract_schedules +--release_taskgraphs +--drop_skipped_tasks +--scheduler_time_discretization=1 + +# Deadline variance +--min_deadline_variance=10 +--max_deadline_variance=25 + +# Execution mode configs. +--execution_mode=replay +--replay_trace=tpch + +# Release time config. +--override_release_policy=gamma +--override_gamma_coefficient=1 +--override_poisson_arrival_rate=1 +--override_num_invocation=10 + +# TPCH flags +--random_seed=1234 +--tpch_query_dag_spec=profiles/workload/tpch/queries.yaml +--tpch_dataset_size=50 +--worker_profile_path=profiles/workers/tpch_cluster.yaml diff --git a/configs/tpch_replay_edf.conf b/configs/tpch_replay_edf.conf new file mode 100644 index 00000000..cf23650a --- /dev/null +++ b/configs/tpch_replay_edf.conf @@ -0,0 +1,47 @@ +# Output configs. +# --log=./tpch_replay_dsched.log +# --log_level=debug +# --csv=./tpch_replay_dsched.csv + +--log=./tpch_replay_edf.log +--log_level=debug +--csv=./tpch_replay_edf.csv + +# Task configs. +--runtime_variance=0 + +# Scheduler configs. + +# EDF +--scheduler=EDF +--scheduler_runtime=0 +--enforce_deadlines + +# DSched +# --scheduler=TetriSched +# --scheduler_runtime=0 +# --enforce_deadlines +# --retract_schedules +# --release_taskgraphs +# --drop_skipped_tasks +# --scheduler_time_discretization=1 + +# Deadline variance +--min_deadline_variance=10 +--max_deadline_variance=25 + +# Execution mode configs. +--execution_mode=replay +--replay_trace=tpch + +# Release time config. 
+--override_release_policy=gamma +--override_gamma_coefficient=1 +--override_poisson_arrival_rate=1 +--override_num_invocation=10 + +# TPCH flags +--random_seed=1234 +--tpch_query_dag_spec=profiles/workload/tpch/queries.yaml +--tpch_dataset_size=50 +--worker_profile_path=profiles/workers/tpch_cluster.yaml diff --git a/data/__init__.py b/data/__init__.py index ec2c2986..9db1ee5b 100644 --- a/data/__init__.py +++ b/data/__init__.py @@ -7,6 +7,7 @@ from .task_loader_benchmark import TaskLoaderBenchmark from .task_loader_pylot import TaskLoaderPylot from .task_loader_synthetic import TaskLoaderSynthetic +from .tpch_loader import TpchWorkloadLoader from .worker_loader import WorkerLoader from .worker_loader_benchmark import WorkerLoaderBenchmark from .workload_loader import WorkloadLoader diff --git a/data/csv_reader.py b/data/csv_reader.py index d4d0d1f4..b81e0767 100644 --- a/data/csv_reader.py +++ b/data/csv_reader.py @@ -63,6 +63,11 @@ def parse_events(self, readings: Mapping[str, Sequence[str]]): ) elif reading[1] == "UPDATE_WORKLOAD": simulator.total_tasks += int(reading[2]) + elif reading[1] == "LOG_STATS": + assert ( + simulator is not None + ), "No SIMULATOR_START found for a corresponding SIMULATOR_END." + simulator.update_stats(reading) elif reading[1] == "SIMULATOR_END": assert ( simulator is not None diff --git a/data/csv_types.py b/data/csv_types.py index 390cd0a6..851299f8 100644 --- a/data/csv_types.py +++ b/data/csv_types.py @@ -385,6 +385,18 @@ def __init__(self, csv_path: str, start_time: int, total_tasks: int = 0): self.scheduler_invocations: list[Scheduler] = [] self.task_graphs: dict[str, TaskGraph] = {} + def update_stats(self, csv_reading: str): + assert ( + csv_reading[1] == "LOG_STATS" + ), f"The event {csv_reading[1]} was not of type LOG_STATS." 
+ self.finished_tasks = int(csv_reading[2]) + self.dropped_tasks = int(csv_reading[3]) + self.missed_deadlines = int(csv_reading[4]) + self.finished_task_graphs = int(csv_reading[5]) + self.dropped_taskgraphs = int(csv_reading[6]) + self.missed_taskgraphs = int(csv_reading[7]) + self.goodput_taskgraphs = self.finished_task_graphs - self.missed_taskgraphs + def update_finish(self, csv_reading: str): """Updates the values of the Simulator based on the SIMULATOR_END event from CSV. @@ -396,10 +408,10 @@ def update_finish(self, csv_reading: str): csv_reading[1] == "SIMULATOR_END" ), f"The event {csv_reading[1]} was not of type SIMULATOR_END." self.end_time = int(csv_reading[0]) - self.finished_tasks = int(csv_reading[2]) - self.dropped_tasks = int(csv_reading[3]) - self.missed_deadlines = int(csv_reading[4]) - self.finished_task_graphs = int(csv_reading[5]) - self.dropped_taskgraphs = int(csv_reading[6]) - self.missed_taskgraphs = int(csv_reading[7]) - self.goodput_taskgraphs = self.finished_task_graphs - self.missed_taskgraphs + # self.finished_tasks = int(csv_reading[2]) + # self.dropped_tasks = int(csv_reading[3]) + # self.missed_deadlines = int(csv_reading[4]) + # self.finished_task_graphs = int(csv_reading[5]) + # self.dropped_taskgraphs = int(csv_reading[6]) + # self.missed_taskgraphs = int(csv_reading[7]) + # self.goodput_taskgraphs = self.finished_task_graphs - self.missed_taskgraphs diff --git a/data/tpch_loader.py b/data/tpch_loader.py new file mode 100644 index 00000000..57ea6816 --- /dev/null +++ b/data/tpch_loader.py @@ -0,0 +1,600 @@ +import os +import math +import json +import sys +import random + +from typing import Any, Dict, List, Optional, Callable, Tuple +from pathlib import Path +from enum import Enum + +import absl +import numpy as np +import yaml +import networkx as nx + +from utils import EventTime, setup_logging +from workload import ( + Workload, + WorkProfile, + TaskGraph, + Job, + JobGraph, + ExecutionStrategy, + ExecutionStrategies, + 
Resource, + Resources, +) + +from .base_workload_loader import BaseWorkloadLoader + + +class TpchQueryDifficulty(Enum): + easy = {1, 3, 4, 6, 12, 14, 17, 19, 22} + medium = {10, 11, 13, 15, 16, 18, 20} + hard = {2, 7, 8, 9, 21} + + +class TpchLoader: + """Construct TPC-H task graph from a query profile + + Args: + path (`str`): Path to a YAML file specifying the TPC-H query DAGs + flags (`absl.flags`): The flags used to initialize the app, if any + + """ + + def __init__( + self, + path: Path, + flags: "absl.flags", + ): + self._logger = setup_logging( + name=self.__class__.__name__, + log_dir=flags.log_dir, + log_file=flags.log_file_name, + log_level=flags.log_level, + ) + self._flags = flags + + # Load the TPC-H DAG structures + with open(path, "r") as f: + workload_data = yaml.safe_load(f) + self._graphs = {} + for query in workload_data["graphs"]: + query_num = int(query["name"][1:]) + self._graphs[query_num] = query["graph"] + + def make_job_graph( + self, + id: str, + query_num: int, + dependencies: Optional[List[Dict[str, Any]]] = None, + profile_type: Optional[str] = None, + dataset_size: Optional[int] = None, + max_executors_per_job: Optional[int] = None, + min_task_runtime: Optional[int] = None, + runtime_unit: EventTime.Unit = EventTime.Unit.US, + ) -> Tuple[TaskGraph, Dict[int, int]]: + if profile_type is None: + profile_type = self._flags.tpch_profile_type + if dataset_size is None: + dataset_size = self._flags.tpch_dataset_size + if max_executors_per_job is None: + max_executors_per_job = self._flags.tpch_max_executors_per_job + if min_task_runtime is None: + min_task_runtime = self._flags.tpch_min_task_runtime + + # Normalize dependencies + if dependencies is None: + dependencies = self._graphs[query_num] + deps_mapping = None + else: + deps_mapping = self.__map_dependencies(query_num, dependencies) + for node in dependencies: + node["name"] = deps_mapping[node["name"]] + if "children" in node: + node["children"] = [deps_mapping[c] for c in 
node["children"]] + self._logger.info( + f"Mapped dependencies for TPC-H query {query_name(query_num)} as {deps_mapping}." + ) + + # Construct a JobGraph + job_graph = JobGraph(name=task_graph_name(query_num, id)) + profiler_data = get_all_stage_info_for_query( + query_num, + profile_type, + dataset_size, + max_executors_per_job, + ) + name_to_job = {} + for node in dependencies: + worker_profile = self.__make_work_profile( + profiler_data=profiler_data, + query_num=query_num, + node_name=node["name"], + max_executors_per_job=max_executors_per_job, + min_task_runtime=min_task_runtime, + runtime_unit=runtime_unit, + ) + job = Job( + name=node["name"], + profile=worker_profile, + ) + name_to_job[node["name"]] = job + job_graph.add_job(job=job) + for node in dependencies: + job = name_to_job[node["name"]] + if "children" in node: + for child in node["children"]: + if child not in name_to_job: + raise ValueError( + f"Child {child} of {node['name']} was " + f"not present in the graph." + ) + child_job = name_to_job[child] + job_graph.add_child(job, child_job) + + self._logger.info( + f"Constructed JobGraph for TPC-H query {query_name(query_num)}." + ) + + return job_graph, deps_mapping + + def __make_work_profile( + self, + profiler_data: Dict[int, Dict[str, Any]], + query_num: int, + node_name: str, + max_executors_per_job: int, + min_task_runtime: int, + runtime_unit: EventTime, + ) -> WorkProfile: + profile = profiler_data[int(node_name)] + + profiled_task_slots = profile["num_tasks"] + profiled_runtime = math.ceil(profile["avg_task_duration_ms"] / 1e3) + + if profiled_task_slots > max_executors_per_job: + num_slots = max_executors_per_job + runtime = math.ceil( + (profiled_task_slots * profiled_runtime) / max_executors_per_job + ) + self._logger.debug( + "%s@%s: num_slots (%s) > max_executors_per_job (%s). 
def __map_dependencies(self, query_num: int, deps: List[Dict[str, Any]]):
    """Map the stage names used in `deps` onto the canonical stage names.

    Both the canonical dependencies of the query (`self._graphs[query_num]`)
    and the provided `deps` are converted to directed graphs, and a directed
    graph isomorphism between the two is searched for.

    Args:
        query_num (`int`): The TPC-H query whose canonical graph is used.
        deps (`List[Dict[str, Any]]`): Nodes of the provided graph. Each node
            has a "name" and, optionally, a list of "children" names.

    Returns:
        A dict mapping each stage name in `deps` to the name of the matching
        stage in the canonical graph.

    Raises:
        ValueError: If `deps` is not structurally identical to the canonical
            graph of the query.
    """

    def deps_to_nx_graph(nodes: List[Dict[str, Any]]) -> "nx.DiGraph":
        graph = nx.DiGraph()
        for node in nodes:
            # Register every stage explicitly. Building the graph from the
            # edge list alone silently drops stages that have no edges
            # (e.g. a single-stage query), which would then be missing from
            # the returned mapping.
            graph.add_node(node["name"])
            if "children" in node:
                graph.add_edges_from(
                    (node["name"], child) for child in node["children"]
                )
        return graph

    base_deps = self._graphs[query_num]

    # The dependency graphs are directed, so use the matcher that NetworkX
    # provides for directed graphs; it also rejects graphs whose node or edge
    # counts differ, so no separate size check is needed.
    matcher = nx.isomorphism.DiGraphMatcher(
        deps_to_nx_graph(base_deps), deps_to_nx_graph(deps)
    )

    # The matcher yields {canonical-stage-id: provided-stage-id}; the first
    # isomorphism found is used.
    mapping = next(matcher.isomorphisms_iter(), None)
    if mapping is None:
        raise ValueError(
            f"Structure of dependencies provided for query number {query_num} "
            f"does not match that of canonical dependencies. "
            f"Provided: {deps}. Canonical: {base_deps}"
        )

    # Callers look stages up by the provided name, so reverse the direction.
    return {provided: canonical for canonical, provided in mapping.items()}
len(flags.override_poisson_arrival_rates) == len( + flags.override_num_invocations + ) + + # only works with poisson distribution + assert flags.override_release_policy == "poisson" + + for i, part in enumerate(TpchQueryDifficulty): + release_policy = self.__make_release_policy( + policy_type=flags.override_release_policy, + arrival_rate=float(flags.override_poisson_arrival_rates[i]), + num_invocations=int(flags.override_num_invocations[i]), + ) + release_times = release_policy.get_release_times( + completion_time=EventTime( + self._flags.loop_timeout, EventTime.Unit.US + ) + ) + query_nums = [ + self._rng.choice(list(part.value)) + for _ in range(int(flags.override_num_invocations[i])) + ] + self._query_nums_and_release_times.extend( + list(zip(query_nums, release_times)) + ) + + self._query_nums_and_release_times.sort(key=lambda x: x[1]) + else: + release_policy = self.__make_release_policy() + release_times = release_policy.get_release_times( + completion_time=EventTime(self._flags.loop_timeout, EventTime.Unit.US) + ) + query_nums = [ + self._rng.randint(1, self._tpch_loader.num_queries) + for _ in range(self._flags.override_num_invocation) + ] + self._query_nums_and_release_times.extend( + list(zip(query_nums, release_times)) + ) + + self._current_release_pointer = 0 + + # Initialize workload + self._workload = Workload.empty(flags) + + def __make_release_policy( + self, policy_type=None, arrival_rate=None, num_invocations=None + ): + if policy_type is None: + policy_type = self._flags.override_release_policy + if arrival_rate is None: + arrival_rate = self._flags.override_poisson_arrival_rate + if num_invocations is None: + num_invocations = self._flags.override_num_invocation + + release_policy_args = {} + if policy_type == "periodic": + release_policy_args = { + "period": EventTime( + self._flags.override_arrival_period, EventTime.Unit.US + ), + } + elif policy_type == "fixed": + release_policy_args = { + "period": EventTime( + 
self._flags.override_arrival_period, EventTime.Unit.US + ), + "num_invocations": num_invocations, + } + elif policy_type == "poisson": + release_policy_args = { + "rate": arrival_rate, + "num_invocations": num_invocations, + } + elif policy_type == "gamma": + release_policy_args = { + "rate": arrival_rate, + "num_invocations": num_invocations, + "coefficient": self._flags.override_gamma_coefficient, + } + elif policy_type == "fixed_gamma": + release_policy_args = { + "variable_arrival_rate": arrival_rate, + "base_arrival_rate": self._flags.override_base_arrival_rate, + "num_invocations": num_invocations, + "coefficient": self._flags.override_gamma_coefficient, + } + else: + raise NotImplementedError(f"Release policy {policy_type} not implemented.") + + return make_release_policy( + policy_type, + release_policy_args, + self._rng, + self._rng_seed, + ( + self._flags.randomize_start_time_min, + self._flags.randomize_start_time_max, + ), + ) + + def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: + # Reset rng if this is the first workload. 
def make_release_policy(
    release_policy, release_policy_args, rng, seed, randomize_start_time=(0, 0)
):
    """Construct the release policy named `release_policy`.

    Args:
        release_policy (`str`): Name of the factory to look up on
            `JobGraph.ReleasePolicy` (e.g. "poisson", "gamma").
        release_policy_args (`Dict[str, Any]`): Keyword arguments forwarded to
            the factory. None of the values may be `None`.
        rng: Random number generator exposing `randint(low, high)`; used to
            draw the start time of the policy.
        seed: Seed forwarded to the factory as `rng_seed`.
        randomize_start_time (`Tuple[int, int]`): Bounds passed to
            `rng.randint` when drawing the start time (in microseconds).

    Returns:
        The release policy built by the selected factory.

    Raises:
        ValueError: If any value in `release_policy_args` is `None`.
    """
    # Validate explicitly instead of using `assert`, which is stripped when
    # Python runs with -O and does not say which argument is missing.
    unset_args = [name for name, value in release_policy_args.items() if value is None]
    if unset_args:
        raise ValueError(
            f"Release policy {release_policy} received None for argument(s): "
            f"{', '.join(map(str, unset_args))}."
        )

    # Construct the release policy
    start_time = EventTime(
        time=rng.randint(*randomize_start_time),
        unit=EventTime.Unit.US,
    )
    policy_factory = getattr(JobGraph.ReleasePolicy, release_policy)
    return policy_factory(start=start_time, rng_seed=seed, **release_policy_args)
def pre_process_task_duration(task_duration):
    """Clean the first-wave durations of a profiled stage, in place.

    For every key of `task_duration["first_wave"]`:

    1. Durations that also appear in `task_duration["fresh_durations"]` for
       the same key are removed from the first wave. Each fresh duration
       cancels at most one first-wave entry, so duplicates are respected.
    2. If the cleaned first wave of a key ends up empty, it is filled with a
       copy of the cleaned first wave of the nearest smaller key (keys are
       visited in sorted order).

    The cleaned result replaces `task_duration["first_wave"]`; the original
    implementation computed it but never stored it, so the call had no effect.

    Args:
        task_duration (`Dict[str, Dict[Any, List]]`): Profile of a stage with
            at least the "first_wave" and "fresh_durations" entries, keyed
            identically.

    Returns:
        None. `task_duration` is modified in place.
    """
    from collections import Counter

    clean_first_wave = {}
    for key, first_wave in task_duration["first_wave"].items():
        # Multiset of the fresh durations that can still cancel an entry.
        fresh_left = Counter(iter(task_duration["fresh_durations"][key]))
        kept = []
        for duration in first_wave:
            if fresh_left[duration] > 0:
                # Consume one occurrence so that a duplicated fresh duration
                # does not block more first-wave entries than it should.
                fresh_left[duration] -= 1
            else:
                kept.append(duration)
        clean_first_wave[key] = kept

    # Drag the nearest-neighbour first wave into the empty spots.
    last_first_wave = []
    for key in sorted(clean_first_wave):
        if not clean_first_wave[key]:
            clean_first_wave[key] = list(last_first_wave)
        last_first_wave = clean_first_wave[key]

    # Swap in the first wave with the fresh durations removed.
    task_duration["first_wave"] = clean_first_wave
round(stage_profile["average_runtime_ms"]), + } + stage_info[i] = curr_stage + + return stage_info + + +def use_decima_tpch_profile(query_num, dataset_size): + task_durations = np.load( + os.path.join( + DECIMA_TPCH_DIR, dataset_size, "task_duration_" + str(query_num) + ".npy" + ), + allow_pickle=True, + ).item() + + num_nodes = len(task_durations) + + stage_info = {} + + for n in range(num_nodes): + task_duration = task_durations[n] + e = next(iter(task_duration["first_wave"])) + # NOTE: somehow only picks the first element {2: [n_tasks_in_ms]} + + num_tasks = len(task_duration["first_wave"][e]) + len( + task_duration["rest_wave"][e] + ) + + # remove fresh duration from first wave duration + # drag nearest neighbor first wave duration to empty spots + pre_process_task_duration(task_duration) + rough_duration = np.mean( + [i for t in task_duration["first_wave"].values() for i in t] + + [i for t in task_duration["rest_wave"].values() for i in t] + + [i for t in task_duration["fresh_durations"].values() for i in t] + ) + + # NOTE: Runtime per task is given in milliseconds + curr_stage = { + "stage_id": n, + "num_tasks": num_tasks, + "avg_task_duration_ms": round(rough_duration), + } + stage_info[n] = curr_stage + + return stage_info diff --git a/data/tpch_partitioning_analysis.ipynb b/data/tpch_partitioning_analysis.ipynb new file mode 100644 index 00000000..6b0eb160 --- /dev/null +++ b/data/tpch_partitioning_analysis.ipynb @@ -0,0 +1,315 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import math" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def load_json(file_path):\n", + " \"\"\"Load JSON data from a file.\"\"\"\n", + " with open(file_path, \"r\") as f:\n", + " return json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def 
recompute_tasks_runtime(stage_id, num_tasks, avg_runtime_ms, max_executors, min_task_runtime_ms):\n", + " \"\"\"Recompute the number of tasks and runtime per task while enforcing constraints.\"\"\"\n", + " profiled_runtime = math.ceil(avg_runtime_ms / 1000) # Convert ms to seconds\n", + "\n", + " if num_tasks > max_executors:\n", + " adjusted_runtime = math.ceil((num_tasks * profiled_runtime) / max_executors)\n", + " adjusted_num_tasks = max_executors\n", + " else:\n", + " adjusted_runtime = profiled_runtime\n", + " adjusted_num_tasks = num_tasks\n", + "\n", + " final_runtime = max(math.ceil(min_task_runtime_ms / 1000), adjusted_runtime) # Enforce min runtime\n", + "\n", + " # print(\n", + " # f\"Stage {stage_id}: num_tasks ({num_tasks}) -> {adjusted_num_tasks}, \"\n", + " # f\"runtime_s ({profiled_runtime}) -> {final_runtime}\"\n", + " # )\n", + "\n", + " return adjusted_num_tasks, final_runtime" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_tpch_data(json_data, dataset_size, max_executors, min_task_runtime_ms):\n", + " \"\"\"Extract TPCH query data, filter relevant queries, and recompute runtime.\"\"\"\n", + " extracted_data = {}\n", + "\n", + " for query_key, stages in json_data.items():\n", + " if dataset_size in query_key and \"maxCores_\" + str(max_executors) in query_key:\n", + " query_id = query_key.split(\"_\")[1] # Extract query number (e.g., 'q1')\n", + " # print(f\"--------Processing query {query_id}\")\n", + " extracted_data[query_id] = [\n", + " (stage[\"stage_id\"], recompute_tasks_runtime(stage[\"stage_id\"], stage[\"num_tasks\"], int(stage[\"average_runtime_ms\"]), max_executors, min_task_runtime_ms))\n", + " for stage in stages\n", + " ]\n", + "\n", + " return extracted_data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_resource_space(data):\n", + " \"\"\"Compute the total resource space 
required for each query.\"\"\"\n", + " return {query_id: sum(num_tasks * runtime for _, (num_tasks, runtime) in stages) for query_id, stages in data.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def bucketize_queries(resource_space, bucket_size):\n", + " \"\"\"Classify queries into easy, medium, and hard buckets based on resource consumption.\"\"\"\n", + " buckets = {\"easy\": [], \"medium\": [], \"hard\": []}\n", + " for query_id, value in resource_space.items():\n", + " if value < bucket_size:\n", + " buckets[\"easy\"].append(query_id)\n", + " elif bucket_size <= value <= 2 * bucket_size:\n", + " buckets[\"medium\"].append(query_id)\n", + " else:\n", + " buckets[\"hard\"].append(query_id)\n", + " return buckets" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def analyze_tpch_queries(json_path, bucket_size, dataset_size, max_executors, min_task_runtime_ms):\n", + " \"\"\"Main function to process TPCH queries and return categorized buckets.\"\"\"\n", + " json_data = load_json(json_path)\n", + " extracted_data = extract_tpch_data(json_data, dataset_size, max_executors, min_task_runtime_ms)\n", + " resource_requirements = compute_resource_space(extracted_data)\n", + " return bucketize_queries(resource_requirements, bucket_size)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Input params to create buckets\n", + "json_path = \"/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/cloudlab/cloudlab_22query_tpch_profiles.json\"\n", + "bucket_size=8000\n", + "dataset_size=\"100g\"\n", + "max_executors=200\n", + "min_task_runtime_ms=12000\n", + "\n", + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=bucket_size, dataset_size=dataset_size, max_executors=max_executors, min_task_runtime_ms=min_task_runtime_ms)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "#### 100g dataset, varying the executors: 75, 100, 200" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'easy': ['q11', 'q13', 'q14', 'q15', 'q19', 'q20', 'q22'],\n", + " 'medium': ['q1', 'q2', 'q4', 'q6', 'q10', 'q12', 'q16', 'q17', 'q18'],\n", + " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q21']}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=5500, dataset_size=\"100g\", max_executors=75, min_task_runtime_ms=12000)\n", + "buckets" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n", + " 'medium': ['q1', 'q4', 'q6', 'q10', 'q12', 'q14', 'q15', 'q17', 'q20'],\n", + " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q18', 'q21']}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=8000, dataset_size=\"100g\", max_executors=100, min_task_runtime_ms=12000)\n", + "buckets" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'easy': ['q6', 'q11', 'q13', 'q19', 'q22'],\n", + " 'medium': ['q1', 'q2', 'q4', 'q10', 'q12', 'q14', 'q15', 'q16', 'q20'],\n", + " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q17', 'q18', 'q21']}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=10000, dataset_size=\"100g\", max_executors=200, min_task_runtime_ms=12000)\n", + "buckets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 250g dataset, varying the executors: 
75, 100, 250" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n", + " 'medium': ['q1', 'q6', 'q7', 'q10', 'q12', 'q14', 'q15', 'q20'],\n", + " 'hard': ['q3', 'q4', 'q5', 'q8', 'q9', 'q17', 'q18', 'q21']}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=11500, dataset_size=\"250g\", max_executors=75, min_task_runtime_ms=12000)\n", + "buckets" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n", + " 'medium': ['q1', 'q6', 'q10', 'q12', 'q14', 'q15', 'q20'],\n", + " 'hard': ['q3', 'q4', 'q5', 'q7', 'q8', 'q9', 'q17', 'q18', 'q21']}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=15000, dataset_size=\"250g\", max_executors=100, min_task_runtime_ms=12000)\n", + "buckets" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'easy': ['q1', 'q2', 'q6', 'q11', 'q13', 'q16', 'q22'],\n", + " 'medium': ['q4', 'q7', 'q10', 'q12', 'q14', 'q15', 'q19', 'q20'],\n", + " 'hard': ['q3', 'q5', 'q8', 'q9', 'q17', 'q18', 'q21']}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=15000, dataset_size=\"250g\", max_executors=200, min_task_runtime_ms=12000)\n", + "buckets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dg_erdos", + "language": 
flags.DEFINE_bool(
    "orchestrated",
    False,
    "Runs the simulator in orchestrated mode. Currently used by the ERDOS service.",
)
# NOTE: The help text is assembled from adjacent string literals, so every
# fragment except the last must end with a space; otherwise the words are
# fused in the rendered help ("cannot beplaced").
flags.DEFINE_integer(
    "min_placement_push_duration",
    1,
    "The duration (in µs) by which to push a task placement if it cannot be "
    "placed on a worker at its original time",
)
) +# TPCH related flags +flags.DEFINE_string( + "tpch_query_dag_spec", + "./profiles/workload/tpch/queries.yaml", + "Path to a YAML file specifying the TPC-H query DAGs", +) +flags.DEFINE_integer( + "tpch_num_queries", + 50, + "Number of TPC-H queries to run", +) +flags.DEFINE_enum( + "tpch_profile_type", + "Cloudlab", + ["Cloudlab", "Decima"], + "Type of TPC-H profile the data loader must use", +) +flags.DEFINE_enum( + "tpch_dataset_size", + "50", + ["2", "50", "100", "250", "500"], + "Size of the TPC-H dataset to use", +) +flags.DEFINE_integer( + "tpch_max_executors_per_job", + 50, + "Maximum number of executors to use per TPC-H query stage", +) +flags.DEFINE_integer( + "tpch_min_task_runtime", + 8, + "Minimum runtime of a TPC-H task", +) + # AlibabaLoader related flags. flags.DEFINE_integer( "alibaba_loader_task_cpu_multiplier", @@ -345,8 +391,9 @@ flags.DEFINE_integer( "scheduler_max_time_discretization", 5, - "The maximum discretization that the scheduler can have (in µs). " - "Only used when scheduler_adaptive_discretization flag is enabled. (default: 5)", + "The maximum discretization that the scheduler can have. " + "Only used when scheduler_adaptive_discretization flag is enabled. (default: 5). " + "Be careful about the EventTime.Unit. Some parts of the code assume Unit.US", ) flags.DEFINE_float( "scheduler_max_occupancy_threshold", @@ -385,9 +432,10 @@ "scheduler_time_discretization", 1, "The length of each slot in the space-time matrix to consider for scheduling the " - "tasks (in µs). The default value is 1µs, and a higher value can lead to faster " + "tasks. The default value is 1 (see note for unit), and a higher value can lead to faster " "solutions but a potentially lower goodput due to resources being blocked for the " - "entirety of the slot.", + "entirety of the slot. NOTE: Since time in the simulator is an abstract concept, be " + "careful about the EventTime.Unit. 
Some parts of the code might assume Unit.US", ) flags.DEFINE_enum( "scheduler_policy", @@ -473,7 +521,7 @@ "placing the TaskGraph, and drop the TaskGraph if it cannot be placed after.", ) flags.DEFINE_multi_enum( - "optimization_passes", + "opt_passes", [], [ "CRITICAL_PATH_PASS", @@ -633,6 +681,8 @@ def main(args): ), flags=FLAGS, ) + elif FLAGS.replay_trace == "tpch": + workload_loader = TpchWorkloadLoader(flags=FLAGS) else: raise NotImplementedError( f"Replay trace {FLAGS.replay_trace} is not implemented yet." diff --git a/profiles/workers/tpch_cluster.yaml b/profiles/workers/tpch_cluster.yaml new file mode 100644 index 00000000..582302b2 --- /dev/null +++ b/profiles/workers/tpch_cluster.yaml @@ -0,0 +1,6 @@ +- name: WorkerPool_1 + workers: + - name: Worker_1_1 + resources: + - name: Slot + quantity: 640 diff --git a/requirements.txt b/requirements.txt index f3e8957c..4be1c543 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,3 +14,4 @@ cplex pre-commit black isort +networkx diff --git a/rpc/README.md b/rpc/README.md index 294e2287..dcb13dbe 100644 --- a/rpc/README.md +++ b/rpc/README.md @@ -5,26 +5,26 @@ The package provides support for connecting frameworks to the ERDOS Simulator, w This code is being tested with Apache Spark v3.5.0 (with additional instrumentation outlined in [this](https://github.com/dhruvsgarg/spark_mirror/tree/erdos-spark-integration) repository) -To get the RPC service setup, first install the required packages using: +To get the RPC service setup, from the ERDOS root directory, install the required packages using: ```bash -pip install -r requirements.txt +pip install -r rpc/requirements.txt ``` Then, run protoc to generate the service and message definitions using: ```bash -python -m grpc_tools.protoc -I./protos --python_out=./ --grpc_python_out=./ ./protos/erdos_scheduler.proto +python -m grpc_tools.protoc -I./rpc/protos --python_out=. --grpc_python_out=. 
./rpc/protos/rpc/erdos_scheduler.proto ``` and run the service using: ```bash -python service.py +python -m rpc.service ``` You can also find the supported flags by the service, by running ```bash -python service.py --help +python -m rpc.service --help ``` diff --git a/rpc/launch_tpch_queries.py b/rpc/launch_tpch_queries.py new file mode 100644 index 00000000..2d0f91bf --- /dev/null +++ b/rpc/launch_tpch_queries.py @@ -0,0 +1,256 @@ +import argparse +import os +import random +import subprocess +import sys +import time +import numpy as np + +from pathlib import Path + +from workload import JobGraph +from utils import EventTime +from data.tpch_loader import make_release_policy +from rpc import erdos_scheduler_pb2 +from rpc import erdos_scheduler_pb2_grpc + +import grpc + + +def map_dataset_to_deadline(dataset_size): + # 50gb => 2mins, 100gb => 6mins, 250gb => 12mins, 500gb => 24mins + mapping = {"50": 120, "100": 360, "250": 720, "500": 1440} + return mapping.get(dataset_size, 120) # Default to 120s if dataset size is NA + + +def launch_query(query_number, index, args): + deadline = map_dataset_to_deadline(args.dataset_size) + + cmd = [ + f"{args.spark_mirror_path.resolve()}/bin/spark-submit", + *("--deploy-mode", "cluster"), + *("--master", f"spark://{args.spark_master_ip}:7077"), + *("--conf", "'spark.port.maxRetries=132'"), + *("--conf", "'spark.eventLog.enabled=true'"), + *("--conf", f"'spark.eventLog.dir={args.spark_eventlog_dir.resolve()}'"), + *("--conf", "'spark.sql.adaptive.enabled=false'"), + *("--conf", "'spark.sql.adaptive.coalescePartitions.enabled=false'"), + *("--conf", "'spark.sql.autoBroadcastJoinThreshold=-1'"), + *("--conf", "'spark.sql.shuffle.partitions=1'"), + *("--conf", "'spark.sql.files.minPartitionNum=1'"), + *("--conf", "'spark.sql.files.maxPartitionNum=1'"), + *("--conf", f"'spark.app.deadline={deadline}'"), + *("--class", "'main.scala.TpchQuery'"), + f"{args.tpch_spark_path.resolve()}/target/scala-2.13/spark-tpc-h-queries_2.13-1.0.jar", + 
f"{query_number}", + f"{index}", + f"{args.dataset_size}", + f"{args.max_cores}", + ] + + # print( + # f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Launching Query: {query_number}, " + # f"dataset: {args.dataset_size}GB, deadline: {deadline}s, maxCores: {args.max_cores}" + # ) + + try: + cmd = " ".join(cmd) + print("Launching:", cmd) + p = subprocess.Popen( + cmd, + shell=True, + ) + print("Query launched successfully.") + return p + except Exception as e: + print(f"Error launching query: {e}") + + +def generate_release_times(rng, args): + if args.distribution == "periodic": + release_policy_args = { + "period": EventTime(args.period, EventTime.Unit.US), + } + elif args.distribution == "fixed": + release_policy_args = { + "period": EventTime(args.period, EventTime.Unit.US), + "num_invocations": args.num_queries, + } + elif args.distribution == "poisson": + release_policy_args = { + "rate": args.variable_arrival_rate, + "num_invocations": args.num_queries, + } + elif args.distribution == "gamma": + release_policy_args = { + "rate": args.variable_arrival_rate, + "num_invocations": args.num_queries, + "coefficient": args.coefficient, + } + elif args.distribution == "fixed_gamma": + release_policy_args = { + "variable_arrival_rate": args.variable_arrival_rate, + "base_arrival_rate": args.base_arrival_rate, + "num_invocations": args.num_queries, + "coefficient": args.coefficient, + } + else: + raise NotImplementedError( + f"Release policy {args.distribution} not implemented." + ) + + release_policy = make_release_policy( + args.distribution, + release_policy_args, + rng, + args.rng_seed, + (args.randomize_start_time_min, args.randomize_start_time_max), + ) + + release_times = release_policy.get_release_times( + completion_time=EventTime(sys.maxsize, EventTime.Unit.US) + ) + + return release_times + + +def main(): + parser = argparse.ArgumentParser( + description="Generate a workload of queries based on distribution type." 
+ ) + parser.add_argument( + "--spark-mirror-path", + type=Path, + required=True, + help="Path to spark-mirror repository", + ) + parser.add_argument( + "--spark-master-ip", + type=str, + required=True, + help="IP address of node running Spark master", + ) + parser.add_argument( + "--tpch-spark-path", + type=Path, + required=True, + help="Path to TPC-H Spark repository", + ) + parser.add_argument( + "--spark-eventlog-dir", + default=Path(os.getcwd()) / "spark-eventlog", + type=Path, + help="Path to directory in which Spark event logs will be dumped", + ) + parser.add_argument( + "--distribution", + choices=["periodic", "fixed", "poisson", "gamma", "closed_loop", "fixed_gamma"], + default="gamma", + help="Type of distribution for query inter-arrival times (default: gamma)", + ) + parser.add_argument( + "--num_queries", + type=int, + default=50, + help="Number of queries to generate (default: 50)", + ) + parser.add_argument( + "--dataset_size", + choices=["50", "100", "250", "500"], + default="50", + help="Dataset size per query in GB (default: 50)", + ) + parser.add_argument( + "--max_cores", + type=int, + choices=[50, 75, 100, 200], + default=50, + help="Maximum executor cores (default: 50)", + ) + parser.add_argument( + "--period", + type=int, + default=25, + help="Releases a DAG after period time has elapsed", + ) + parser.add_argument( + "--variable_arrival_rate", + type=float, + default=1.0, + help="Variable arrival rate for poisson, gamma and fixed_gamma distributions", + ) + parser.add_argument( + "--coefficient", + type=float, + default=1.0, + help="Coefficient for gamma and fixed_gamma distributions", + ) + parser.add_argument( + "--base_arrival_rate", + type=float, + default=1.0, + help="Base arrival rate for fixed_gamma distribution", + ) + parser.add_argument("--randomize_start_time_min", type=int, default=0) + parser.add_argument("--randomize_start_time_max", type=int, default=0) + parser.add_argument( + "--rng_seed", + type=int, + default=1234, + help="RNG seed for 
generating inter-arrival periods and picking DAGs (default: 1234)", + ) + parser.add_argument( + "--queries", type=int, nargs="+", help="Launch specific queries" + ) + + args = parser.parse_args() + + if not args.spark_eventlog_dir.exists(): + args.spark_eventlog_dir.mkdir(parents=True) + + os.environ["TPCH_INPUT_DATA_DIR"] = str(args.tpch_spark_path.resolve() / "dbgen") + + if args.queries: + assert len(args.queries) == args.num_queries  # one explicit query number per requested query + + rng = random.Random(args.rng_seed) + + # Generate release times + release_times = generate_release_times(rng, args) + print("Release times:", release_times) + + # Launch queries + ps = [] + inter_arrival_times = [release_times[0].time] + for i in range(len(release_times) - 1): + inter_arrival_times.append(release_times[i + 1].time - release_times[i].time) + for i, inter_arrival_time in enumerate(inter_arrival_times): + time.sleep(inter_arrival_time) + if args.queries: + query_number = args.queries[i] + else: + query_number = rng.randint(1, 22) + ps.append(launch_query(query_number, i, args)) + print( + f"({i+1}/{len(release_times)})", + "Current time: ", + time.strftime("%Y-%m-%d %H:%M:%S"), + " launching query: ", + query_number, + ) + + for p in filter(None, ps):  # launch_query() returns None when a launch fails; skip those + p.wait() + + # Wait for some time before sending the shutdown signal + time.sleep(20) + + channel = grpc.insecure_channel("localhost:50051") + stub = erdos_scheduler_pb2_grpc.SchedulerServiceStub(channel) + response = stub.Shutdown(erdos_scheduler_pb2.Empty()) + channel.close() + print("Sent shutdown signal to the service") + + +if __name__ == "__main__": + main() diff --git a/rpc/protos/erdos_scheduler.proto b/rpc/protos/rpc/erdos_scheduler.proto similarity index 98% rename from rpc/protos/erdos_scheduler.proto rename to rpc/protos/rpc/erdos_scheduler.proto index 494f5b49..262254da 100644 --- a/rpc/protos/erdos_scheduler.proto +++ b/rpc/protos/rpc/erdos_scheduler.proto @@ -47,6 +47,8 @@ service SchedulerService { /// Notifies the Scheduler that a Task from a particular TaskGraph has 
completed.option rpc NotifyTaskCompletion(NotifyTaskCompletionRequest) returns (NotifyTaskCompletionResponse) {} + + rpc Shutdown(Empty) returns (Empty) {} } @@ -199,4 +201,7 @@ message GetPlacementsResponse { bool success = 1; repeated Placement placements = 2; string message = 3; + bool terminate = 4; // terminate the task graph } + +message Empty {} diff --git a/rpc/service.py b/rpc/service.py index 2aaa2dc9..f4a7d504 100644 --- a/rpc/service.py +++ b/rpc/service.py @@ -1,39 +1,35 @@ -import asyncio -import heapq -import os +import threading import sys import time -from collections import defaultdict +import asyncio from concurrent import futures -from operator import attrgetter -from typing import Mapping, Sequence from urllib.parse import urlparse +from typing import Optional, Dict, Callable, Tuple +from enum import Enum +from dataclasses import dataclass -sys.path.append( - os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) -) - -import erdos_scheduler_pb2 -import erdos_scheduler_pb2_grpc -import grpc -from absl import app, flags -from tpch_utils import get_all_stage_info_for_query, verify_and_relable_tpch_app_graph - -from schedulers import EDFScheduler, FIFOScheduler -from utils import EventTime, setup_logging +# TODO: refactor out the need to import main to get common flags +import main +from simulator import Simulator, Event, EventTime, EventType from workers import Worker, WorkerPool, WorkerPools from workload import ( - ExecutionStrategies, - ExecutionStrategy, - Job, - Placement, Resource, Resources, - Task, - TaskGraph, Workload, - WorkProfile, + TaskGraph, + TaskState, + Placement, + JobGraph, ) +from data import BaseWorkloadLoader +from data.tpch_loader import TpchLoader +from utils import setup_logging, setup_csv_logging +from rpc import erdos_scheduler_pb2 +from rpc import erdos_scheduler_pb2_grpc + +import grpc + +from absl import app, flags FLAGS = flags.FLAGS @@ -41,13 +37,6 @@ flags.DEFINE_integer( "max_workers", 10, 
"Maximum number of workers to use for the RPC server." ) -flags.DEFINE_string("log_file", None, "Path to the log file.", short_name="log") -flags.DEFINE_string("log_level", "debug", "The level to log.") -flags.DEFINE_integer( - "initial_executors", - 10, - "The initial number of executors that are requested by each application.", -) flags.DEFINE_integer( "virtualized_cores", 500, @@ -63,809 +52,829 @@ "The amount of virtualized memory (in GB) that must be created in each Worker on " "the framework. Refer to the `virtualized_cores` flag for more information.", ) -flags.DEFINE_enum( - "scheduler", "EDF", ["FIFO", "EDF"], "The scheduler to use for this execution." +flags.DEFINE_integer( + "spark_app_num_initial_executors", + 10, + "The initial number of executors that are requested by each Spark application.", +) +flags.DEFINE_bool( + "override_worker_cpu_count", + False, + "If True, worker CPU count will be set to 640 (Cloudlab 20-node cluster CPU count). " + "This allows us to scale up spark experiments without actually deploying a large " + "spark cluster.", ) -# Define an item containing completion timestamp and task -class TimedItem: - def __init__(self, timestamp, task): - self.timestamp = timestamp - self.task = task +class DataLoader(Enum): + TPCH = "tpch" -# Define a priority queue based on heapq module -class PriorityQueue: - def __init__(self): - self._queue = [] +class WorkloadLoader(BaseWorkloadLoader): + def __init__(self, _flags) -> None: + self._workload = Workload.empty(_flags) - def put(self, item): - heapq.heappush(self._queue, (item.timestamp, item)) + def add_task_graph(self, task_graph: TaskGraph): + self._workload.add_task_graph(task_graph) - def get(self): - _, item = heapq.heappop(self._queue) - return item + def get_next_workload(self, current_time: EventTime) -> Optional[Workload]: + return self._workload - def empty(self): - return len(self._queue) == 0 +@dataclass +class RegisteredApplication: + """ + Represents a registered application 
that can be used to generate task + graphs. It also manages the mapping between Spark stage IDs and canonical + task IDs. -# Implement the service. -class SchedulerServiceServicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): - def __init__(self) -> None: - """Initialize the service, and setup the logger.""" - # Values used by the Servicer. - self._logger = setup_logging(name=FLAGS.log_file, log_level=FLAGS.log_level) - self._initialized = False - self._initialization_time = -1 - self._master_uri = None + A registered application is ready if the `task_graph` attribute is set. - # The simulator types maintained by the Servicer. - self._worker_pool = None - self._worker_pools = None - self._drivers: Mapping[str, Task] = {} - self._workload = None - - # Scheduler information maintained by the servicer. - self._scheduler_running_lock = asyncio.Lock() - self._scheduler_running = False - self._rerun_scheduler = False - if FLAGS.scheduler == "EDF": - self._scheduler = EDFScheduler() - elif FLAGS.scheduler == "FIFO": - self._scheduler = FIFOScheduler() - else: - raise ValueError(f"Unknown scheduler {FLAGS.scheduler}.") + Attributes: + gen (Callable[[EventTime], Tuple[TaskGraph, Dict[int,int]]]): + A function that takes a release time and outputs: + - A task graph + - A mapping from Spark stage IDs to canonical task IDs - # Placement information maintained by the servicer. - # The placements map the application IDs to the Placement retrieved from the - # scheduler. The placements are automatically clipped at the time of informing - # the framework of applying them to the executors. - # NOTE (Sukrit): This must always be sorted by the Placement time. - self._placements: Mapping[str, Sequence[Placement]] = defaultdict(list) + task_graph (TaskGraph, optional): + The generated task graph for the application. Defaults to None. 
- # Additional task information maintained by the servicer - self._tasks_marked_for_completion = PriorityQueue() + Methods: + generate_task_graph(release_time: EventTime): + Sets the `task_graph` attribute by generating a task graph for a + given `release_time`. - # Start the asyncio loop for clearing out pending tasks for completion - asyncio.create_task(self.PopTasksBasedOnTime()) + spark_task_id(task_id: int): + Returns the Spark stage ID corresponding to a canonical task ID. - super().__init__() + canonical_task_id(stage_id: int): + Returns the canonical task ID corresponding to a Spark stage ID. + """ - async def schedule(self) -> None: - """Schedules the tasks that have been added to the Workload.""" - async with self._scheduler_running_lock: - if self._scheduler_running: - self._logger.error( - "Scheduler already running, this should never be reached." - ) - return - self._scheduler_running = True - - current_time = EventTime(int(time.time()), EventTime.Unit.S) - self._logger.info( - "Starting a scheduling cycle with %s TaskGraphs and %s Workers at %s.", - len(self._workload.task_graphs), - len(self._worker_pool.workers), - current_time, - ) + gen: Callable[[EventTime], Tuple[TaskGraph, Dict[int, int]]] + task_graph: TaskGraph = None - # TODO (Sukrit): Change this to a better implementation. - # Let's do some simple scheduling for now, that gives a fixed number of - # executors to all the available applications in intervals of 10 seconds. - if len(self._workload.task_graphs) >= 2: - placements = self._scheduler.schedule( - sim_time=current_time, - workload=self._workload, - worker_pools=self._worker_pools, - ) - # Filter the placements that are not of type PLACE_TASK and that have not - # been placed. 
- filtered_placements = filter( - lambda p: p.placement_type == Placement.PlacementType.PLACE_TASK - and p.is_placed(), - placements, - ) - for placement in sorted( - filtered_placements, key=attrgetter("placement_time") - ): - self._placements[placement.task.task_graph].append(placement) - # Schedule the task here since marking it as running requires it to be - # scheduled before. We mark it to be running when we inform the - # framework of the placement. - placement.task.schedule( - time=placement.placement_time, - placement=placement, - ) + _forward: Dict[int, int] = None # spark stage id => canonical task id + _backward: Dict[int, int] = None # canonical task id => spark stage id + _last_gen: EventTime = None + + def __init__(self, gen): + self.gen = gen + + def generate_task_graph(self, release_time: EventTime): + task_graph, stage_id_mapping = self.gen(release_time) + self.task_graph = task_graph + self._forward = stage_id_mapping + self._backward = {v: k for k, v in self._forward.items()} + self._last_gen = release_time + + def spark_task_id(self, task_id: int): + return self._backward[task_id] + + def canonical_task_id(self, stage_id: int): + return self._forward[stage_id] + + +class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer): + def __init__(self, server) -> None: + self._server = server + + # Override some flags + + # Enable orchestrated mode + FLAGS.orchestrated = True + # Set scheduler runtime to zero + FLAGS.scheduler_runtime = 0 + + self._logger = setup_logging( + name=__name__, + log_dir=FLAGS.log_dir, + log_file=FLAGS.log_file_name, + log_level=FLAGS.log_level, + fmt="[%(asctime)s] {%(funcName)s:%(lineno)d} - %(message)s", + ) + self._csv_logger = setup_csv_logging( + name=__name__, + log_dir=FLAGS.log_dir, + log_file=FLAGS.csv_file_name, + ) + for flag_name in FLAGS: + self._csv_logger.debug( + f"input_flag,{flag_name},{getattr(FLAGS, flag_name)}" + ) - self._logger.info( - "Finished the scheduling cycle initiated at %s.", 
current_time + self._master_uri = None + self._initialization_time = None + self._data_loaders = {} + self._data_loaders[DataLoader.TPCH] = TpchLoader( + path=FLAGS.tpch_query_dag_spec, + flags=FLAGS, ) + self._simulator = None + self._workload_loader = None + + # Instantiate the scheduler based on the given flag. + self._scheduler = None + if FLAGS.scheduler == "FIFO": + from schedulers import FIFOScheduler + + self._scheduler = FIFOScheduler( + preemptive=FLAGS.preemption, + runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US), + enforce_deadlines=FLAGS.enforce_deadlines, # TODO: (DG) Check why this isnt passed in the simulator + _flags=FLAGS, + ) + elif FLAGS.scheduler == "EDF": + from schedulers import EDFScheduler + + self._scheduler = EDFScheduler( + preemptive=FLAGS.preemption, + runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US), + enforce_deadlines=FLAGS.enforce_deadlines, + _flags=FLAGS, + ) + elif FLAGS.scheduler == "TetriSched": + from schedulers import TetriSchedScheduler + + finer_discretization = FLAGS.finer_discretization_at_prev_solution + self._scheduler = TetriSchedScheduler( + preemptive=FLAGS.preemption, + runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US), + lookahead=EventTime(FLAGS.scheduler_lookahead, EventTime.Unit.US), + enforce_deadlines=FLAGS.enforce_deadlines, + retract_schedules=FLAGS.retract_schedules, + release_taskgraphs=FLAGS.release_taskgraphs, + goal=FLAGS.ilp_goal, + time_discretization=EventTime( + FLAGS.scheduler_time_discretization, EventTime.Unit.US + ), + plan_ahead=EventTime(FLAGS.scheduler_plan_ahead, EventTime.Unit.US), + log_to_file=FLAGS.scheduler_log_to_file, + adaptive_discretization=FLAGS.scheduler_adaptive_discretization, + _flags=FLAGS, + max_time_discretization=EventTime( + FLAGS.scheduler_max_time_discretization, EventTime.Unit.US + ), + max_occupancy_threshold=FLAGS.scheduler_max_occupancy_threshold, + finer_discretization_at_prev_solution=finer_discretization, + 
finer_discretization_window=EventTime( + FLAGS.finer_discretization_window, EventTime.Unit.US + ), + plan_ahead_no_consideration_gap=EventTime( + FLAGS.scheduler_plan_ahead_no_consideration_gap, EventTime.Unit.US + ), + ) + else: + raise ValueError(f"Unknown scheduler {FLAGS.scheduler}.") - # Check if another run of the Scheduler has been requested, and if so, create - # a task for it. Otherwise, mark the scheduler as not running. - async with self._scheduler_running_lock: - self._scheduler_running = False - if self._rerun_scheduler: - self._rerun_scheduler = False - asyncio.create_task(self.schedule()) - - async def run_scheduler(self) -> None: - """Checks if the scheduler is running, and if not, starts it. - - If the scheduler is already running, we queue up another execution of the - scheduler. This execution batches the scheduling requests, and runs the - scheduler only once for all the requests.""" - async with self._scheduler_running_lock: - if not self._scheduler_running: - asyncio.create_task(self.schedule()) - else: - self._rerun_scheduler = True + # TODO: Items in _registered_applications are never deleted right now, needs to be handled. + self._registered_applications = {} + self._registered_app_drivers = ( + {} + ) # Spark driver id differs from taskgraph name (application id) + self._received_shutdown = False + self._shutting_down = False + self._lock = threading.Lock() + + super().__init__() async def RegisterFramework(self, request, context): - """Registers a new framework with the backend scheduler. - This is the entry point for a new instance of Spark / Flink to register - itself with the backend scheduler, and is intended as an EHLO. 
- """ - if self._initialized: - self._logger.warning( - "Framework already registered at %s with the address %s", - self._initialization_time, - self._master_uri, - ) + stime = self.__stime() + + if self.__framework_registered(): + msg = f"[{stime}] Framework already registered at the address {self._master_uri} at timestamp {self._initialization_time}" + self._logger.error(msg) return erdos_scheduler_pb2.RegisterFrameworkResponse( success=False, - message=f"Framework already registered at " - f"{self._initialization_time} at the address {self._master_uri}", + message=msg, ) - # Setup a new Framework instance. + t = int(time.time()) framework_name = request.name self._master_uri = request.uri - self._initialization_time = request.timestamp - self._initialized = True - self._logger.info( - "Registering framework %s with URI %s at %s", - framework_name, - self._master_uri, - self._initialization_time, - ) + self._initialization_time = EventTime(t, EventTime.Unit.US) + stime = self.__stime() - # Setup the simulator types. parsed_uri = urlparse(self._master_uri) - self._worker_pool = WorkerPool(name=f"WorkerPool_{parsed_uri.netloc}") - self._worker_pools = WorkerPools(worker_pools=[self._worker_pool]) - self._workload = Workload.from_task_graphs({}) + worker_pool = WorkerPool( + name=f"WorkerPool_{parsed_uri.netloc}", + _logger=self._logger, + ) + self._workload_loader = WorkloadLoader(FLAGS) + + self._simulator = Simulator( + scheduler=self._scheduler, + worker_pools=WorkerPools( + [worker_pool] + ), # Maintain only one worker pool in the simulator + workload_loader=self._workload_loader, + _flags=FLAGS, + ) - # Return the response. 
- return erdos_scheduler_pb2.RegisterFrameworkResponse( - success=True, - message=f"{framework_name} at {self._master_uri} registered successfully!", + msg = f"[{stime}] Registered the framework '{framework_name}' with URI {self._master_uri} at UNIX time {self._initialization_time.time}" + self._logger.info(msg) + return erdos_scheduler_pb2.RegisterFrameworkResponse(success=True, message=msg) + + async def DeregisterFramework(self, request, context): + stime = self.__stime() + + if not self.__framework_registered(): + msg = f"[{stime}] Trying to deregister a framework at {request.uri} but no framework has been registered yet." + self._logger.error(msg) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=False, message=msg + ) + + if self._master_uri != request.uri: + msg = f"[{stime}] Trying to deregister the framework at {request.uri} but the registered framework is at {self._master_uri}" + self._logger.error(msg) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=False, message=msg + ) + + self._initialization_time = None + self._master_uri = None + self._workload_loader = None + self._simulator = None + msg = f"[{stime}] Successfully deregistered the framework at {request.uri}" + self._logger.info(msg) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=True, message=msg ) async def RegisterDriver(self, request, context): - if not self._initialized: - self._logger.warning( - "Trying to register a driver with name %s and id %s, " - "but no framework is registered yet.", - request.name, - request.id, + stime = self.__stime() + + if not self.__worker_registered(): + msg = f"[{stime}] Failed to register driver (id={request.id}) because no worker has been registered yet." 
+ self._logger.error(msg) + return erdos_scheduler_pb2.RegisterDriverResponse( + success=False, + message=msg, ) + + if request.id in self._registered_app_drivers: + msg = f"[{stime}] Driver with id '{request.id}' is already registered" + self._logger.error(msg) return erdos_scheduler_pb2.RegisterDriverResponse( success=False, - message="Framework not registered yet.", - worker_id="", + message=msg, + worker_id=self.__get_worker_id(), ) - # Create a Task for the Driver, and add it to the list of drivers. - # TODO (Sukrit): We drop the memory requirements for now, we should use - # them to do multi-dimensional packing using STRL. - self._logger.info( - "Received a request to register a driver with name %s, URI: %s. " - "The driver requires %s cores and %s memory.", - request.id, - request.uri, - request.cores, - request.memory, - ) - driver_resources = Resources( - resource_vector={Resource(name="Slot_CPU", _id="any"): 1} - ) - driver_job = Job( - name=request.id, - profile=WorkProfile( - name=f"WorkProfile_{request.id}", - execution_strategies=ExecutionStrategies( - [ - ExecutionStrategy( - resources=driver_resources, - batch_size=1, - # NOTE (Sukrit): Drivers are long running, and have no - # fixed runtime. Setting it to zero helps us unload the - # driver from the Worker whenever we need it. - runtime=EventTime.zero(), - ) - ] - ), - ), + # TODO: Update the registered_app_drivers to map the driver id to + # application id once the taskgraph is registered. + self._registered_app_drivers[request.id] = None + + msg = ( + f"[{stime}] Successfully registered driver {request.id} for an application." 
) - driver = Task( - name=request.id, - task_graph=request.uri, - job=driver_job, - deadline=EventTime.invalid(), + self._logger.info(msg) + return erdos_scheduler_pb2.RegisterDriverResponse( + success=True, + message=msg, + worker_id=self.__get_worker_id(), ) - self._drivers[request.id] = driver - - # Iterate over the Workers and find a Worker that can accomodate the driver. - placement_found = False - for worker in self._worker_pool.workers: - for execution_strategy in driver.available_execution_strategies: - if worker.can_accomodate_strategy(execution_strategy): - # This Worker can accomodate the Driver, we assign it here. - placement_found = True - self._worker_pool.place_task(driver, execution_strategy, worker.id) - - # Update the Task's state and placement information. - placement_time = EventTime(request.timestamp, EventTime.Unit.S) - driver.schedule( - time=placement_time, - placement=Placement( - type=Placement.PlacementType.PLACE_TASK, - computation=driver, - placement_time=placement_time, - worker_pool_id=self._worker_pool.id, - worker_id=worker.id, - strategy=execution_strategy, - ), - ) - driver.start(placement_time) - # Tell the framework to start the driver. 
- return erdos_scheduler_pb2.RegisterDriverResponse( - success=True, - message=f"Driver {request.id} registered successfully!", - worker_id=worker.name, - ) + async def DeregisterDriver(self, request, context): + stime = self.__stime() - if not placement_found: - return erdos_scheduler_pb2.RegisterDriverResponse( + if request.id not in self._registered_app_drivers: + msg = f"[{stime}] Driver id '{request.id}' is not registered or does not exist" + self._logger.error(msg) + return erdos_scheduler_pb2.DeregisterDriverResponse( success=False, - message=f"No Worker can accomodate the driver {request.id} yet.", - worker_id="", + message=msg, ) - async def DeregisterDriver(self, request, context): - if not self._initialized: - self._logger.warning( - "Trying to deregister a driver with id %s, " - "but no framework is registered yet.", - request.id, - ) - return erdos_scheduler_pb2.DeregisterDriverResponse( - success=False, message="Framework not registered yet." - ) + # TODO: Dummy mapping from driver to task graph (application), so task_graph_name is None. + # Deletion of taskgraph from registered_applications and driver from registered_app_drivers should be done carefully. 
+ task_graph_name = self._registered_app_drivers[request.id] + del self._registered_app_drivers[request.id] - if request.id not in self._drivers: - self._logger.warning( - "Trying to deregister a driver with id %s, " - "but no driver with that id is registered.", - request.id, - ) - return erdos_scheduler_pb2.DeregisterDriverResponse( - success=False, - message=f"Driver with id {request.id} not registered yet.", + with self._lock: + log_stats_event = Event( + event_type=EventType.LOG_STATS, + time=self.__stime(), ) + self._simulator._event_queue.add_event(log_stats_event) + + msg = f"[{stime}] Successfully de-registered driver with id {request.id} for task graph {task_graph_name}" + self._logger.info(msg) + + if len(self._registered_app_drivers) == 0 and self._received_shutdown: + self._logger.info(f"[{stime}] The last driver has been deregistered; finishing simulation") + # Signals _tick_simulator() to stop. Shouldn't be + # necessary in principle because after the with block + # ends, there shouldn't be any more events left to run, + # but doesn't hurt. + self._shutting_down = True + with self._lock: + self._simulator.simulate() + await self._server.stop(0) - # Deregister the driver. - driver = self._drivers[request.id] - completion_time = EventTime(request.timestamp, EventTime.Unit.S) - self._worker_pool.remove_task(completion_time, driver) - driver.finish(completion_time) - del self._drivers[request.id] return erdos_scheduler_pb2.DeregisterDriverResponse( success=True, - message=f"Driver with id {request.id} deregistered successfully!", + message=msg, ) async def RegisterTaskGraph(self, request, context): - """Registers a new TaskGraph with the backend scheduler. - This is the entry point for a new application of Spark to register - itself with the backend scheduler, and is intended as an EHLO. 
- """ - if not self._initialized: - self._logger.warning( - "Trying to register a task graph with ID %s and name %s, " - "but no framework is registered yet.", - request.id, - request.name, - ) + stime = self.__stime() + + if not self.__worker_registered(): + msg = f"[{stime}] Failed to register task graph (id={request.id}, name={request.name}) because no worker has been registered yet." + self._logger.error(msg) return erdos_scheduler_pb2.RegisterTaskGraphResponse( - success=False, message="Framework not registered yet.", num_executors=0 + success=False, message=msg, num_executors=0 ) - if request.id in self._workload.task_graphs: - self._logger.warning( - "The application with ID %s and name %s was already registered.", - request.id, - request.name, - ) + if request.id in self._registered_applications: + msg = f"[{stime}] The task graph (id={request.id}, name={request.name}) is already registered" + self._logger.error(msg) return erdos_scheduler_pb2.RegisterTaskGraphResponse( - success=False, - message=f"Application ID {request.id} with name {request.name} " - f"already registered!", - num_executors=0, - ) + success=False, message=msg, num_executors=0 + ) + + # We only support TPCH queries for now + if request.name.startswith("TPCH Query"): + # Parse request name + query_parts = request.name.split() + match query_parts: + case _, _, query_num, index, dataset_size, max_executors_per_job: + query_num = int(query_num) + dataset_size = int(dataset_size) + max_executors_per_job = int(max_executors_per_job) + case _, _, query_num, dataset_size, max_executors_per_job: + query_num = int(query_num) + # default index counts up from 0; incorrect if + # Spark receives jobs out of order + index = str(len(self._registered_applications)) + dataset_size = int(dataset_size) + max_executors_per_job = int(max_executors_per_job) + case _, _, query_num: + query_num = int(query_num) + index = str(len(self._registered_applications)) + dataset_size = FLAGS.tpch_dataset_size + 
max_executors_per_job = FLAGS.tpch_max_executors_per_job + case _: + msg = f"[{stime}] Invalid TPCH query request" + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message=msg, num_executors=0 + ) - self._logger.info( - "Attempting to register application ID %s with name %s", - request.id, - request.name, - ) - # Check if query is from TPC-H workload. - # If yes, retrieve profiled slots and runtime info. If no, use default values - is_tpch_query = False - tpch_query_all_stage_info = None - if request.name.startswith("TPCH_"): - is_tpch_query = True - # retrieve tasks-per-stage and runtime info based on query number - tpch_query_num = request.name.split("TPCH_Q", 1)[1] - tpch_query_all_stage_info = get_all_stage_info_for_query(tpch_query_num) - same_structure, stage_id_mapping = verify_and_relable_tpch_app_graph( - query_num=tpch_query_num, dependencies=request.dependencies - ) - - # return failure message if not tpch app isnt of same DAG structure - if not same_structure: - self._logger.warning( - "TPCH application with ID %s and name %s couldn't be registered." - "DAG structure mismatch!", - request.id, - request.name, + # Convert request.dependencies to [{name: int, children: [int]}] + dependencies = [] + for dep in request.dependencies: + dependencies.append( + { + "name": int(dep.key.id), + "children": [int(c) for c in dep.children_ids], + } + ) + + # Create a job graph + self._logger.debug(str((query_num, index, dataset_size, max_executors_per_job))) + try: + job_graph, stage_id_mapping = self._data_loaders[ + DataLoader.TPCH + ].make_job_graph( + id=index, + query_num=query_num, + dependencies=dependencies, + dataset_size=dataset_size, + max_executors_per_job=max_executors_per_job, + runtime_unit=EventTime.Unit.US, ) + except Exception as e: + msg = f"[{stime}] Failed to load TPCH query {query_num}. 
Exception: {e}" + self._logger.error(msg) return erdos_scheduler_pb2.RegisterTaskGraphResponse( - success=False, - message=f"TPCH application ID {request.id} with name {request.name}" - f" couldn't be registered. DAG structure mismatch!", - num_executors=0, + success=False, message=msg, num_executors=0 ) - # Construct all the Tasks for the TaskGraph. - task_ids_to_task: Mapping[int, Task] = {} - default_resource = Resources( - resource_vector={Resource(name="Slot_CPU", _id="any"): 20} - ) - default_runtime = EventTime(20, EventTime.Unit.US) - - for task_dependency in request.dependencies: - framework_task = task_dependency.key - if is_tpch_query: - mapped_stage_id = stage_id_mapping[framework_task.id] - task_slots = tpch_query_all_stage_info[mapped_stage_id]["num_tasks"] - task_runtime = tpch_query_all_stage_info[mapped_stage_id][ - "avg_task_duration" - ] - self._logger.info( - "Creating Task for given app TPCH stage: %s, mapped to " - "original stage id %s, with tasks: %s and avg runtime: %s", - framework_task.id, - mapped_stage_id, - task_slots, - task_runtime, + if not self.__can_accomodate_task_graph(job_graph): + msg = f"[{stime}] The worker Pool cannot accomodate the task graph '{request.id}'" + self._logger.error(msg) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message=msg, num_executors=0 ) - task_ids_to_task[framework_task.id] = Task( - name=framework_task.name, - task_graph=request.id, - job=Job( - name=framework_task.name, - profile=WorkProfile( - name=f"WorkProfile_{framework_task.name}", - execution_strategies=ExecutionStrategies( - [ - ExecutionStrategy( - resources=( - default_resource - if not is_tpch_query - else Resources( - resource_vector={ - Resource( - name="Slot_CPU", _id="any" - ): task_slots - } - ) - ), - batch_size=1, - runtime=( - default_runtime - if not is_tpch_query - else EventTime(task_runtime, EventTime.Unit.US) - ), - ) - ] - ), - ), - ), - deadline=EventTime(request.deadline, EventTime.Unit.S), - # 
TODO (Sukrit): We should maintain a counter for each application - # type so that we can correlate the Tasks with a particular invocation. - timestamp=1, - ) - # NOTE (Sukrit): We maintain the StageID of the Task as a separate field - # that is not accessible / used by the Simulator. - task_ids_to_task[framework_task.id].stage_id = framework_task.id - self._logger.info( - "Constructed Task %s for the TaskGraph %s.", - framework_task.name, - request.id, - ) - - # Construct the TaskGraph from the Tasks. - task_graph_structure: Mapping[Task, Sequence[Task]] = {} - for task_dependency in request.dependencies: - task_graph_structure[task_ids_to_task[task_dependency.key.id]] = [ - task_ids_to_task[task_id] for task_id in task_dependency.children_ids - ] - task_graph = TaskGraph( - name=request.id, - tasks=task_graph_structure, - ) - self._workload.add_task_graph(task_graph) - self._logger.info( - "Added the TaskGraph(name=%s, id=%s) to the Workload.", - request.name, - request.id, - ) - self._logger.info( - "The structure of the TaskGraph %s is \n%s.", - request.id, - str(task_graph), - ) - # Return the response. 
+ def gen(release_time): + task_graph = job_graph.get_next_task_graph( + start_time=release_time, + _flags=FLAGS, + ) + return task_graph, stage_id_mapping + + else: + msg = f"[{stime}] The service only supports TPCH queries" + self._logger.error(msg) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message=msg, num_executors=0 + ) + + self._registered_applications[request.id] = RegisteredApplication(gen=gen) + + msg = f"[{stime}] Registered task graph '{request.id}' successfully" + self._logger.info(msg) return erdos_scheduler_pb2.RegisterTaskGraphResponse( success=True, - message=f"Application ID {request.id} with name " - f"{request.name} and deadline {request.deadline} registered successfully!", - num_executors=FLAGS.initial_executors, + message=msg, + num_executors=FLAGS.spark_app_num_initial_executors, ) async def RegisterEnvironmentReady(self, request, context): - """Registers that the environment (i.e., executors) are ready for the given - TaskGraph at the specified time. - - This is intended to release the sources of the TaskGraph to the scheduling - backend, to consider the application in this scheduling cycle. - """ - if not self._initialized: - self._logger.warning( - "Trying to register that the environment is ready for the TaskGraph " - "with ID %s, but no framework is registered yet.", - request.id, - ) - return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( - success=False, message="Framework not registered yet." 
- ) - - task_graph = self._workload.get_task_graph(request.id) - if task_graph is None: - self._logger.warning( - "Trying to register that the environment is ready for the TaskGraph " - "with ID %s, but no TaskGraph with that ID is registered.", - request.id, - ) - return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( - success=False, - message=f"TaskGraph with ID {request.id} not registered yet.", - ) + stime = self.__stime() - if request.num_executors != FLAGS.initial_executors: - self._logger.warning( - "The TaskGraph %s requires %s executors, but the environment is ready " - "with %s executors.", - request.id, - FLAGS.initial_executors, - request.num_executors, - ) + if request.id not in self._registered_applications: + msg = f"[{stime}] Task graph of id '{request.id}' is not registered or does not exist" + self._logger.error(msg) return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( success=False, - message=f"Number of executors not {FLAGS.initial_executors}.", + message=msg, ) - # Release all the sources of the TaskGraph at the given time. - for source_task in task_graph.get_source_tasks(): - source_task.release(EventTime(request.timestamp, EventTime.Unit.S)) - - # Run the scheduler since the Workload has changed. - await self.run_scheduler() + r = self._registered_applications[request.id] + r.generate_task_graph(stime) - return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( - success=True, - message=f"Environment ready for TaskGraph with ID {request.id}!", - ) + with self._lock: + self._simulator._workload.add_task_graph(r.task_graph) + self._simulator._current_task_graph_placements[r.task_graph.name] = {} - async def DeregisterFramework(self, request, context): - """Deregisters the framework with the backend scheduler. 
- This is the exit point for a running instance of Spark / Flink to deregister""" - if not self._initialized: - self._logger.warning( - "Trying to deregister the framework at %s, " - "but no framework is registered yet.", - request.uri, - ) - return erdos_scheduler_pb2.DeregisterFrameworkResponse( - success=False, message="Framework not registered yet." - ) + for task in r.task_graph.get_releasable_tasks(): + task_release_event = Event( + event_type=EventType.TASK_RELEASE, + time=self.__stime(), + task=task, + ) + self._logger.info( + f"[{stime}] Added event {task_release_event} to the simulator's event queue", + ) + self._simulator._event_queue.add_event(task_release_event) - if not self._master_uri == request.uri: - self._logger.warning( - "Trying to deregister the framework at %s, " - "but the registered framework is at %s.", - request.uri, - self._master_uri, + scheduler_start_event = Event( + event_type=EventType.SCHEDULER_START, + time=self.__stime(), ) - return erdos_scheduler_pb2.DeregisterFrameworkResponse( - success=False, - message=f"Framework not registered at {request.uri} yet.", + self._simulator._event_queue.add_event(scheduler_start_event) + self._logger.info( + f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue" ) - # Deregister the framework. 
- self._initialization_time = None - self._master_uri = None - self._initialized = False - self._logger.info("Deregistering framework at %s", request.uri) - return erdos_scheduler_pb2.DeregisterFrameworkResponse( + msg = f"[{stime}] Successfully marked environment as ready for task graph '{r.task_graph.name}'" + self._logger.info(msg) + return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( success=True, - message=f"Framework at {request.uri} deregistered successfully!", + message=msg, ) async def RegisterWorker(self, request, context): - """Registers a new worker with the backend scheduler.""" - if not self._initialized: - self._logger.warning( - "Trying to register a worker with name %s and id %s, " - "but no framework is registered yet.", - request.name, - request.id, - ) + stime = self.__stime() + + if not self.__framework_registered(): + msg = f"[{stime}] Trying to register a worker (id={request.id}, name={request.name}) but no framework is registered yet" + self._logger.error(msg) return erdos_scheduler_pb2.RegisterWorkerResponse( - success=False, message="Framework not registered yet." + success=False, message=msg ) - # First, we construct the Resources with the given size. - # TODO (Sukrit): Right now, we drop the memory requirements, we should use + # TODO(Sukrit): Right now, we drop the memory requirements, we should use # them to do multi-dimensional packing using STRL. - cpu_resource = Resource(name="Slot_CPU") - worker_resources = Resources(resource_vector={cpu_resource: request.cores}) - self._logger.debug( - "Successfully constructed the resources for the worker %s: %s.", - request.name, - worker_resources, - ) - # Construct a new Worker instance, and add it to the WorkerPool. 
+ cpu_resource = Resource(name="Slot") + worker_resources = Resources( + resource_vector={ + cpu_resource: ( + request.cores if not FLAGS.override_worker_cpu_count else 640 + ) + }, + _logger=self._logger, + ) worker = Worker( name=request.id, resources=worker_resources, - ) - self._worker_pool.add_workers([worker]) - - self._logger.info( - "Registering worker with name %s, and resources %s.", - worker.name, - worker_resources, + _logger=self._logger, ) - # Run the scheduler since the Resource set has changed, and new task graphs - # may become eligible to run. - await self.run_scheduler() + self.__get_worker_pool().add_workers([worker]) + msg = f"[{stime}] Registered worker (id={request.id}, name={request.name})" + self._logger.info(msg) return erdos_scheduler_pb2.RegisterWorkerResponse( success=True, - message=f"Worker {request.name} registered successfully!", + message=msg, cores=FLAGS.virtualized_cores, memory=FLAGS.virtualized_memory * 1024, ) - async def NotifyTaskCompletion(self, request, context): - """Notifies the backend scheduler that a task has completed.""" - if not self._initialized: - self._logger.warning( - "Trying to notify the backend scheduler that the task with ID %s " - "from application %s has completed, " - "but no framework is registered yet.", - request.task_id, - request.application_id, + async def GetPlacements(self, request, context): + stime = self.__stime() + + # Check if the task graph is registered + if request.id not in self._registered_applications: + msg = f"[{stime}] Task graph with id '{request.id}' is not registered or does not exist" + self._logger.error(msg) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=False, + message=msg, ) - return erdos_scheduler_pb2.NotifyTaskCompletionResponse( - success=False, message="Framework not registered yet." 
+ + r = self._registered_applications[request.id] + + if r.task_graph is None: + msg = f"[{stime}] Task graph '{request.id}' is not ready" + self._logger.error(msg) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=True, + message=msg, + placements=[], ) - task_graph = self._workload.get_task_graph(request.application_id) - if task_graph is None: - self._logger.warning( - "Trying to notify the backend scheduler that the task with ID %s " - "from application %s has completed, but the application " - "was not registered with the backend yet.", - request.task_id, - request.application_id, + if r.task_graph.is_complete(): + msg = f"[{stime}] Task graph '{r.task_graph.name}' is complete. No more placements to provide." + self._logger.error(msg) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=True, + message=msg, + ) + + # A task graph is considered complete if **all** of its **sink** tasks + # are complete. It is considered cancelled if **any** of its **sink** + # tasks are cancelled. + + # If the task graph is complete, the Spark application will + # automatically shut down because it knows that all of its stages have + # finished executing. + + # Matters get interesting in the presence of task cancellations. The + # service is aware of which tasks are cancelled. + + # First, even when a task graph is cancelled, the simulator (without + # orchestration) + # continues to schedule and execute any tasks that were released into + # the system. The service, which runs the simulator in orchestrated + # mode, must emulate this behavior to maintain parity. + + # Second, from Spark's point of view, however, those tasks are still + # pending placements. So, Spark will continue to periodically invoke + # `GetPlacements` in the hopes of receiving placements for those + # cancelled tasks. Left unhandled, the Spark application will loop + # indefinitely waiting for placements. + + # We _could_ communicate these task cancellations to Spark. 
Then, we + # can modify the DAGScheduler to invoke GetPlacements until all of its + # stages have either finished executing or have been cancelled, after + # which it can safely terminate the application. + + # However, we run into an issue due to VIRTUAL tasks. When a task is + # cancelled, the simulator invokes `TaskGraph.cancel(task)`. + # `TaskGraph.cancel(task)` traverses the tree rooted at `task` + # depth-first, cancelling tasks along the way until it finds the first + # terminal task. As a consequence, it is possible for the tree rooted + # at a cancelled task to have VIRTUAL tasks inside of it. These + # virtual tasks will never receive placements because they are not + # releasable. So, it is possible for the Spark application to stall on + # `GetPlacements` waiting on placements for these virtual tasks. + + # Since the service knows the state of each task, it is easy then for + # the service to determine when the Spark application should terminate + # in the presence of task cancellations. + + # So, instead of communicating task cancellations, we communicate when + # the Spark application should terminate. + # + # The first check makes sure all tasks are either CANCELLED, + # COMPLETED, or VIRTUAL. We check for all tasks because it is possible + # that the simulator is processing released and scheduled tasks. If we + # terminate early, then we will never receive `NotifyTaskCompletion`s + # for those tasks (because the Spark application was terminated), + # which then results in those tasks never getting removed from the + # worker pool. + # + # The second check makes sure that the task graph is indeed cancelled. + # We have this additional guard because at the start all tasks are + # VIRTUAL and we don't want to terminate the application then. 
+ + if r.task_graph.is_cancelled(): + self._logger.error(f"[{stime}] Task graph '{r.task_graph.name}' is in state cancelled.") + + should_terminate = all( + task.state + in ( + TaskState.CANCELLED, + TaskState.COMPLETED, + TaskState.VIRTUAL, + ) + for task in r.task_graph + ) and (r.task_graph.is_cancelled()) + if should_terminate: + msg = f"[{stime}] Task graph '{r.task_graph.name}' was cancelled and simulator has processed all released/ scheduled tasks. Terminating it since it has no more placements to provide." + self._logger.error(msg) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=True, + message=msg, + terminate=True, + ) + elif r.task_graph.is_cancelled() and not should_terminate: + msg = f"[{stime}] Task graph '{r.task_graph.name}' was cancelled but simulator is still processing some released/ scheduled tasks. Will provide placements." + self._logger.error(msg) + else: + msg = f"[{stime}] Task graph '{r.task_graph.name}' is actively running. Will provide placements." 
+ self._logger.info(msg) + + with self._lock: + sim_placements = self._simulator.get_current_placements_for_task_graph( + r.task_graph.name ) + + placements = [] + for placement in sim_placements: + if placement.task.state != TaskState.RUNNING: + self._logger.debug(f"[{stime}] Skipping placement: {placement}") + continue + + worker_id = ( + self.__get_worker_id() + if placement.placement_type == Placement.PlacementType.PLACE_TASK + else "None" + ) + task_id = r.spark_task_id(placement.task.name) + cores = ( + sum(x for _, x in placement.execution_strategy.resources.resources) + if placement.placement_type == Placement.PlacementType.PLACE_TASK + else 0 + ) + + if placement.placement_type not in (Placement.PlacementType.PLACE_TASK,): + raise NotImplementedError + + placements.append( + { + "worker_id": worker_id, + "application_id": request.id, + "task_id": task_id, + "cores": cores, + }, + ) + + msg = f"[{stime}] Returning the following placements {placements} for task graph '{request.id}'." + self._logger.info(msg) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=True, + message=msg, + placements=placements, + ) + + async def NotifyTaskCompletion(self, request, context): + stime = self.__stime() + + # Check if the task graph is registered + if request.application_id not in self._registered_applications: + msg = f"[{stime}] Task graph with id '{request.id}' is not registered or does not exist" + self._logger.error(msg) return erdos_scheduler_pb2.NotifyTaskCompletionResponse( success=False, - message=f"Application with ID {request.application_id} " - f"not registered yet.", - ) - - # Find the Task that has completed, and mark it as such. 
- matched_task = None - for task in task_graph.get_nodes(): - if task.stage_id == request.task_id: - matched_task = task - if matched_task is None: - self._logger.warning( - "Trying to notify the backend scheduler that the task with ID %s " - "from application %s has completed, but the task " - "was not found in the TaskGraph.", - request.task_id, - request.application_id, + message=msg, ) + + r = self._registered_applications[request.application_id] + task = r.task_graph.get_task(r.canonical_task_id(request.task_id)) + if task is None: + msg = f"[{stime}] Task '{request.task_id}' does not exist in the task graph '{r.task_graph.name}'" + self._logger.error(msg) return erdos_scheduler_pb2.NotifyTaskCompletionResponse( success=False, - message=f"Task with ID {request.task_id} " - f"not found in TaskGraph {request.application_id}.", + message=msg, ) - # Instead of completing & removing the task immediately, check - # if it is actually complete or will complete in the future + if task.state != TaskState.RUNNING: + msg = f"[{stime}] Received task completion notification for task '{request.task_id}' (mapped to '{r.canonical_task_id(request.task_id)}') of '{r.task_graph.name}' but it is not running" + self._logger.error(msg) + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=False, + message=msg, + ) - # Get the actual task completion timestamp + # HACK: The worker pool doesn't step every tick (probably should). So, the task.remaining_time is not accurate. 
We compute actual_task_completion then by getting the runtime from the profile, actual_task_completion_time = ( - matched_task.start_time.time + matched_task.remaining_time.time - ) - - current_time = time.time() - self._logger.info( - "Received task for completion at time: %s , task.start_time: %s ," - "task.remaining_time (=runtime): %s , actual completion time: %s ", - round(current_time), - matched_task.start_time.time, - matched_task.remaining_time.time, - actual_task_completion_time, - ) - - # TODO DG: remaining_time assumes execution of the slowest strategy - # Should be updated to reflect correct remaining_time based on chosen strategy? - - # Add all tasks to _tasks_marked_for_completion queue. - # If task has actually completed, it will be dequeued immediately - # Else it will be dequeued at its actual task completion time - self._tasks_marked_for_completion.put( - TimedItem(actual_task_completion_time, matched_task) + task.start_time + task.slowest_execution_strategy.runtime ) - # NOTE: task.finish() and run_scheduler() invocations are postponed - # until it is time for the task to be actually marked as complete. - - return erdos_scheduler_pb2.NotifyTaskCompletionResponse( - success=True, - message=f"Task with ID {request.task_id} marked for completion at " - f"{round(current_time)}! It will be removed on actual " - f"task completion time at {actual_task_completion_time}", - ) - - async def GetPlacements(self, request, context): - """Retrieves the placements applicable at the specified time.""" - request_timestamp = EventTime(request.timestamp, EventTime.Unit.S) - if not self._initialized: - self._logger.warning( - "Trying to get placements for %s at time %s, " - "but no framework is registered yet.", - request.id, - request_timestamp, + with self._lock: + # NOTE: Although the actual_task_completion_time works for task completion notifications that arrive early, it is + # inaccurate for task completion notifications that occur past that time. 
Thus, a max of the current and actual completion time + # is taken to ensure that the task is marked completed at the correct time. + task_finished_event = Event( + event_type=EventType.TASK_FINISHED, + time=max(actual_task_completion_time, self.__stime()), + task=task, ) - return erdos_scheduler_pb2.GetPlacementsResponse( - success=False, message="Framework not registered yet." + self._simulator._event_queue.add_event(task_finished_event) + self._logger.info( + f"[{stime}] Adding event {task_finished_event} to the simulator's event queue" ) + if actual_task_completion_time < self.__stime(): + self._logger.error( + f"[{stime}] Task '{request.task_id}' of task graph '{r.task_graph.name}' had exceeded its runtime by {self.__stime() - actual_task_completion_time}") - if request.id not in self._placements: - self._logger.warning( - "Trying to get placements for %s at time %s, but the application " - "was not registered with the backend yet.", - request.id, - request_timestamp, + scheduler_start_event = Event( + event_type=EventType.SCHEDULER_START, + time=max( + actual_task_completion_time.to(EventTime.Unit.US), + self.__stime(), + ), + ) + self._simulator._event_queue.add_event(scheduler_start_event) + self._logger.info( + f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue" ) - # Construct and return the placements., - placements = [] - clip_at = -1 - for index, placement in enumerate(self._placements[request.id]): - if placement.placement_time <= request_timestamp: - clip_at = index - # Mark the Task as RUNNING. 
- placement.task.start(request_timestamp) - - # resources = placement.execution_strategy.resources - placements.append( - erdos_scheduler_pb2.Placement( - worker_id=placement.worker_id, - application_id=request.id, - task_id=placement.task.stage_id, - cores=1, - ) - ) - self._placements[request.id] = self._placements[request.id][clip_at + 1 :] - self._logger.info( - "Constructed %s placements at time %s for application with ID %s.", - len(placements), - request.timestamp, - request.id, - ) - return erdos_scheduler_pb2.GetPlacementsResponse( + msg = f"[{stime}] Successfully processed completion of task '{request.task_id}' of task graph '{r.task_graph.name}'" + self._logger.info(msg) + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( success=True, - placements=placements, - message=f"Constructed {len(placements)} " - f"placements at time {request.timestamp}.", + message=msg, ) - # Function to pop tasks from queue based on actual completion time - async def PopTasksBasedOnTime(self): - while True: - if not self._tasks_marked_for_completion.empty(): - # Get the top item from the priority queue - top_item = self._tasks_marked_for_completion._queue[0][1] - - # Check if top item's timestamp is reached or passed by current time - current_time = time.time() - if top_item.timestamp <= current_time: - # Pop the top item - popped_item = self._tasks_marked_for_completion.get() - self._logger.info( - "Removing tasks from pending completion queue: %s at time: %s", - popped_item.task, - current_time, - ) + async def Shutdown(self, request, context): + self._received_shutdown = True + return erdos_scheduler_pb2.Empty() + + async def _tick_simulator(self): + while not self._shutting_down: + with self._lock: + if self._simulator is not None: + stime = self.__stime() + # self._logger.debug(f"[{stime}] Simulator tick") + self._simulator.tick(until=stime) + # else: + # print("Simulator instance is None") + await asyncio.sleep(1) + + def __stime(self) -> EventTime: + """ + Time 
as viewed by the service. Starts when a framework is registered + and ends when it is deregistered. + """ + if self._initialization_time is None: + return EventTime.invalid() + ts = int(time.time()) + # NOTE: The service runs in the US time unit for better compatibility with the simulator. + # The simulator uses an abstract unit of time, and it is all relative. + ts = EventTime(ts, EventTime.Unit.US) + return ts - self._initialization_time + + def __framework_registered(self): + return self._simulator is not None + + def __worker_registered(self): + return ( + self.__framework_registered() and len(self.__get_worker_pool().workers) > 0 + ) - # Mark the Task as completed. - # Also release the task from the scheduler service - popped_item.task.update_remaining_time(EventTime.zero()) - popped_item.task.finish( - EventTime(round(current_time), EventTime.Unit.S) - ) + def __get_worker_pool(self): + # Simulator maintains only one worker pool, so this should be fine + return next(iter(self._simulator._worker_pools.worker_pools)) - # Run the scheduler since the Workload has changed. 
- await self.run_scheduler() + def __get_worker(self): + return self.__get_worker_pool().workers[0] - else: - # If the top item's timestamp hasn't been reached yet, - # sleep for a short duration - await asyncio.sleep(0.1) # TODO: Can adjust value, curr=0.1s - else: - # If the queue is empty, sleep for a short duration - await asyncio.sleep(0.1) # TODO: Can adjust value, curr=0.1s + def __get_worker_id(self): + # We return the name here because we register the worker id from + # Spark as the name of the worker in the worker pool + return self.__get_worker().name + def __can_accomodate_task_graph(self, job_graph: JobGraph): + worker_resources = self.__get_worker().resources + for job in job_graph: + for strat in job.execution_strategies: + for resource, quantity in strat.resources.resources: + if worker_resources.get_total_quantity(resource) < quantity: + return False + return True -async def serve(): - """Serves the ERDOS Scheduling RPC Server.""" - # Initialize the server. - server = grpc.aio.server(futures.ThreadPoolExecutor(max_workers=FLAGS.max_workers)) - erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server( - SchedulerServiceServicer(), server - ) - # Start the server. 
async def serve(server):
    """Start the given gRPC server and wait until it terminates.

    Args:
        server: The already-configured `grpc.aio` server to run.
    """
    await server.start()
    print("Initialized ERDOS RPC Service on port", FLAGS.port)
    await server.wait_for_termination()


def main(_argv):
    """Build the gRPC server and servicer, then run them on a fresh event loop.

    Args:
        _argv: Command-line arguments forwarded by the application runner
            (unused).
    """
    # Create and install the loop explicitly: relying on
    # `asyncio.get_event_loop()` to create one implicitly is deprecated.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    server = grpc.aio.server(futures.ThreadPoolExecutor(max_workers=FLAGS.max_workers))
    servicer = Servicer(server)
    erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server(servicer, server)
    server.add_insecure_port(f"[::]:{FLAGS.port}")

    # Schedule the periodic tick_simulator task. A reference is kept because
    # the event loop only holds weak references to its tasks.
    tick_task = loop.create_task(servicer._tick_simulator())

    try:
        loop.run_until_complete(serve(server))
    except KeyboardInterrupt:
        print("Terminated ERDOS RPC Service")
    finally:
        # Cancel the background task and let it unwind before the loop is
        # closed, so that no pending task is destroyed with the loop.
        tick_task.cancel()
        loop.run_until_complete(asyncio.gather(tick_task, return_exceptions=True))
        loop.close()
+flags.DEFINE_integer("port", 50051, "Port to serve the ERDOS Scheduling RPC Server on.") +flags.DEFINE_integer( + "max_workers", 10, "Maximum number of workers to use for the RPC server." +) +flags.DEFINE_string("log_file_name", None, "Name of the log file.", short_name="log") +flags.DEFINE_string("log_level", "debug", "The level to log.") +flags.DEFINE_integer( + "initial_executors", + 10, + "The initial number of executors that are requested by each application.", +) +flags.DEFINE_float( + "spark_task_duration_multiplier", + 1, + "The multiplier used for spark job task runtimes. Buffer time is added " + "to ensure that tasks complete before the scheduler expects it to complete. " + "Completion of tasks after the scheduler's expected task completion time " + "is detrimental for scheduler's planning and could invalidate some schedules", +) +flags.DEFINE_integer( + "virtualized_cores", + 500, + "The number of virtualized cores that must be created in each Worker on the " + "framework. This allows us to spawn a higher number of executors than the number " + "possible with actual available resources. Thus, we can spawn the executors for " + "each application, and only selectively activate them according to the actual " + "available resources.", +) +flags.DEFINE_integer( + "virtualized_memory", + 500, + "The amount of virtualized memory (in GB) that must be created in each Worker on " + "the framework. Refer to the `virtualized_cores` flag for more information.", +) +flags.DEFINE_enum( + "scheduler", "DAGSched", ["FIFO", "EDF", "DAGSched"], "The scheduler to use for " + "this execution." +) +flags.DEFINE_enum( + "tpch_profile_type", "Cloudlab", ["Decima", "Cloudlab"], "The set of profiles to " + "use for execution of tpch queries. Note that Cloudlab profile has all 22 queries. " + "From the Decima profile we support only 15 queries (1-10, 12-14, 16, 19). The " + "rest might also run but DAG structure might not match Decima profiles." 
+) +flags.DEFINE_enum( + "tpch_dataset_size", "50", ["50", "100", "250", "500"], "Options in GB eg. 50g for " + "dataset size of TPCH query. The Cloudlab profile will be picked accordingly. " +) +flags.DEFINE_enum( + "tpch_max_executors_per_job", "50", ["50", "75", "100", "200"], "Options for " + "max executors to use for tpch queries. The Cloudlab profile will be picked " + "accordingly." +) +flags.DEFINE_bool( + "override_worker_cpu_count", + False, + "If True, worker CPU count will be set to INT_MAX. This allows us to scale up " + "spark experiments without actually deploying a large spark cluster.", +) +flags.DEFINE_bool( + "use_profile_to_scale_executors", + False, + "If True, it means that a fixed number of (max) executors was given to the " + "spark job to run. With this profile, we can directly use the profiled " + "stage runtime, while setting the number of required slots or executors " + "to 1 per stage. This allows us do the same scheduling but creates less " + "overhead for this rpc service while running the experiments.", +) +flags.DEFINE_bool( + "release_taskgraphs", + False, + "If True, all tasks from a graph are released if any of the tasks have " + "reached their release time.", +) +flags.DEFINE_bool( + "enforce_deadlines", + False, + "True if the ILP formulation must ensure that deadlines are met.", +) +flags.DEFINE_integer( + "scheduler_time_discretization", + 1, + "The length of each slot in the space-time matrix to consider for scheduling the " + "tasks (in µs). 
The default value is 1µs, and a higher value can lead to faster " + "solutions but a potentially lower goodput due to resources being blocked for the " + "entirety of the slot.", +) +flags.DEFINE_bool( + "scheduler_enable_optimization_pass", + False, + "If `True`, the scheduler runs pre/post-translation optimization passes" + "when registering STRL expression.", +) +flags.DEFINE_float( + "scheduler_reconsideration_period", + 0.1, + "The percentage of critical path duration until which the scheduler will try " + "placing the TaskGraph, and drop the TaskGraph if it cannot be placed after.", +) +flags.DEFINE_bool( + "retract_schedules", False, "Enable the retraction of previously decided schedules." +) +flags.DEFINE_integer( + "scheduler_time_limit", + 3, + "The time limit (in seconds) to allow the scheduler to keep " + "searching for solutions without finding a better one.", +) +flags.DEFINE_bool( + "scheduler_dynamic_discretization", + False, + "If `True`, the scheduler creates space-time matrix non-uniformly. " + "The discretization is dynamically decided based on the occupancy request for " + "each time slice. (default: False)", +) +flags.DEFINE_integer( + "scheduler_max_time_discretization", + 8, + "The maximum discretization that the scheduler can have (in µs). " + "Only used when scheduler_adaptive_discretization flag is enabled. (default: 8)", +) +flags.DEFINE_float( + "scheduler_max_occupancy_threshold", + 0.8, + "The percentage b/w 0 and 1 of maximum occupancy beyond which the discretization " + "would always be 1 incase of dynamic discretization. " + "This flag is only used when dynamic discretization is enabled (default: 0.8)", +) +flags.DEFINE_bool( + "finer_discretization_at_prev_solution", + False, + "If `True`, the scheduler keeps discretization of 1 around previous solution. " + "The discretization is dynamically decided based on the occupancy request for " + "each time slice. 
(default: False)", +) +flags.DEFINE_integer( + "finer_discretization_window", + 5, + "The window around previous solution that keeps discretization of 1.", +) +flags.DEFINE_bool( + "scheduler_selective_rescheduling", + False, + "If `True`, the supported schedulers will follow some pre-defined strategies for " + "selectively sampling TaskGraphs to reschedule.", +) +flags.DEFINE_integer( + "scheduler_plan_ahead_no_consideration_gap", + 4, + "The length of time gap (in µs) for which the reconsiderations are frozen. " + "From the current time to the consideration gap, any tasks placed will not be " + "reconsidered for rescheduling.", +) +flags.DEFINE_list( + "scheduler_log_times", + [], + "A list of timestamps (in µs) at which to request extra logging from the Scheduler." + "If scheduler_log_to_file is `True`, then extra information will be requested for " + "all timestamps.", +) +flags.DEFINE_integer( + "scheduler_selective_rescheduling_sample_size", + 5, + "If `scheduler_selective_rescheduling` is True, then this flag defines the number " + "of TaskGraphs to sample for rescheduling.", +) +flags.DEFINE_integer( + "min_task_graph_deadline_variance", + 10, + "The MIN percentage (additive) factor to be used with critical path length of the task graph. " + "This helps inform the deadline for the taskgraph and all tasks within the task " + "graph. The value be > 0 since the taskgraph would take atleast the critical path " + "time duration to complete.", +) +flags.DEFINE_integer( + "max_task_graph_deadline_variance", + 25, + "The MAX percentage (additive) factor to be used with critical path length of the task graph. " + "This helps inform the deadline for the taskgraph and all tasks within the task " + "graph. 
# Define an item containing completion timestamp and task
class TimedItem:
    """A task paired with the timestamp used to order it in a priority queue.

    Items are ordered by `timestamp`; items with equal timestamps are ordered
    by the `id` assigned at construction time.
    """

    # Next identifier to hand out, shared by all instances.
    _next_id = 0
    # Largest identifier handed out before the counter wraps back to zero.
    _id_threshold = 99999

    def __init__(self, timestamp, task):
        self.timestamp = timestamp
        self.task = task
        self.id = TimedItem._next_id
        TimedItem._next_id += 1

        # Reset _next_id if it crosses the threshold to keep it bounded.
        # NOTE: after a reset, a newer item receives a smaller id than an
        # older item, so the tie-break between equal timestamps is only
        # insertion-ordered between two resets.
        if TimedItem._next_id > TimedItem._id_threshold:
            TimedItem._next_id = 0

    def __lt__(self, other):
        """Less than comparison for TimedItem instances."""
        if not isinstance(other, TimedItem):
            return NotImplemented
        if self.timestamp == other.timestamp:
            # The per-item id acts as the tie-breaker when inserting into
            # the PriorityQueue.
            return self.id < other.id
        return self.timestamp < other.timestamp

    def __eq__(self, other):
        """Equality comparison for TimedItem instances."""
        if not isinstance(other, TimedItem):
            # Let Python fall back to its default handling instead of
            # failing on a missing attribute (e.g. `item == None`).
            return NotImplemented
        return self.timestamp == other.timestamp and self.id == other.id

    def __hash__(self):
        """Hash consistent with `__eq__`, so items stay usable in sets/dicts."""
        return hash((self.timestamp, self.id))


# Define a priority queue based on heapq module
class PriorityQueue:
    """A minimal min-priority queue of timestamped items backed by `heapq`."""

    def __init__(self):
        self._queue = []

    def put(self, item):
        """Insert `item`, keyed by its `timestamp` attribute."""
        heapq.heappush(self._queue, (item.timestamp, item))

    def get(self):
        """Remove and return the smallest item.

        Raises:
            IndexError: If the queue is empty.
        """
        _, item = heapq.heappop(self._queue)
        return item

    def empty(self):
        """Return `True` when the queue holds no items."""
        return not self._queue
+ self._logger = setup_logging( + name=__name__, + log_dir=FLAGS.log_dir, + log_file=FLAGS.log_file_name, + log_level=FLAGS.log_level + ) + self._initialized = False + self._initialization_time = -1 + self._last_step_up_time = EventTime.zero() + self._master_uri = None + + # The simulator types maintained by the Servicer. + self._worker_pool = None + self._worker_pools = None + self._drivers: Mapping[str, Task] = {} + self._workload = None + + # Track taskgraph completion progress. + self._total_taskgraphs_registered = 0 + self._total_taskgraphs_missed = 0 + self._total_taskgraphs_met = 0 + self._total_taskgraphs_cancelled = 0 + self._cancelled_taskgraphs = set() + self._min_task_graph_deadline_variance = FLAGS.min_task_graph_deadline_variance + self._max_task_graph_deadline_variance = FLAGS.max_task_graph_deadline_variance + + # Setting a rng for future use + self._rng = random.Random(FLAGS.random_seed) + + # Scheduler information maintained by the servicer. + self._scheduler_running_lock = asyncio.Lock() + self._scheduler_running = False + self._rerun_scheduler = False + self._scheduler_is_task_type = False + if FLAGS.scheduler == "EDF": + self._scheduler = EDFScheduler( + enforce_deadlines=FLAGS.enforce_deadlines, + _flags=FLAGS, + ) + self._scheduler_is_task_type = True + elif FLAGS.scheduler == "FIFO": + # NOTE: FIFO is supposed to be run as deadline unaware + self._scheduler = FIFOScheduler( + enforce_deadlines=FLAGS.enforce_deadlines, + _flags=FLAGS, + ) + self._scheduler_is_task_type = True + elif FLAGS.scheduler == "DAGSched": + # --scheduler=TetriSched + # --release_taskgraphs + # --enforce_deadlines + # --scheduler_time_discretization=1 ====> Conv to EventTime & passed through diff arg name + # --scheduler_enable_optimization_pass ====> Passed through _flags + # --retract_schedules + # --scheduler_dynamic_discretization ====> Passed through different argument name + # --scheduler_max_time_discretization=8 ====> Conv to EventTime & passed through diff arg 
name + # --scheduler_max_occupancy_threshold=0.999 ====> Passed through different argument name + # --finer_discretization_at_prev_solution + # --finer_discretization_window=4 + # --scheduler_selective_rescheduling (DISABLE) ====> Passed through _flags + # --scheduler_reconsideration_period=0.99 ====> Passed through _flags + + self._scheduler = TetriSchedScheduler( + release_taskgraphs=FLAGS.release_taskgraphs, + time_discretization=EventTime( + FLAGS.scheduler_time_discretization, EventTime.Unit.US + ), + _flags=FLAGS, + max_time_discretization=EventTime( + FLAGS.scheduler_max_time_discretization, EventTime.Unit.US + ), + enforce_deadlines=FLAGS.enforce_deadlines, + dynamic_discretization=FLAGS.scheduler_dynamic_discretization, + max_occupancy_threshold=FLAGS.scheduler_max_occupancy_threshold, + retract_schedules=FLAGS.retract_schedules, + finer_discretization_at_prev_solution=( + FLAGS.finer_discretization_at_prev_solution + ), + finer_discretization_window=EventTime( + FLAGS.finer_discretization_window, EventTime.Unit.US + ), + plan_ahead_no_consideration_gap=EventTime( + FLAGS.scheduler_plan_ahead_no_consideration_gap, EventTime.Unit.US + ), + log_to_file=True, + ) + self._scheduler_is_task_type = not FLAGS.release_taskgraphs + else: + raise ValueError(f"Unknown scheduler {FLAGS.scheduler}.") + + # Placement information maintained by the servicer. + # The placements map the application IDs to the Placement retrieved from the + # scheduler. + # NOTE: (DG) This is a new nested dict implementation. + # First level of dict is a mapping from app-id to all tasks in that app-id + # Second level of dict is a mapping from tasks to exact placement. + # TODO: (DG) This will no longer be ordered by time, so the check needs to be + # done for all tasks? Also, we might need to delete the placement once executed? 
+ self._placements: Dict[str, Dict[str, Placement]] = {} + + # _executed_placements keep a track of previously completed placements since + # placements are deleted after being released. Can be used for debugging. + self._executed_placements: Dict[str, Placement] = {} + + # Additional task information maintained by the servicer + self._tasks_marked_for_completion = PriorityQueue() + + # Start the asyncio loop for clearing out pending tasks for completion + asyncio.create_task(self.PopTasksBasedOnTime()) + + super().__init__() + + async def schedule(self) -> None: + """Schedules the tasks that have been added to the Workload.""" + current_time = EventTime(int(time.time()), EventTime.Unit.US) - self._initialization_time + + async with self._scheduler_running_lock: + if self._scheduler_running: + self._logger.error( + "[%s] Scheduler already running, this should never be reached.", + current_time, + ) + return + self._scheduler_running = True + + self._logger.info( + "[%s] Starting a scheduling cycle with %s TaskGraphs and %s Workers.", + current_time, + len(self._workload.task_graphs), + len(self._worker_pool.workers), + ) + + # Cumulate the resources from all the WorkerPools + for worker_pool in self._worker_pools.worker_pools: + worker_pool_resources = worker_pool.resources + for resource_name in set( + map(lambda value: value[0].name, worker_pool_resources.resources) + ): + resource = Resource(name=resource_name, _id="any") + self._logger.info( + f"{current_time},WORKER_POOL_UTILIZATION,{worker_pool.id}," + f"{resource_name}," + f"{worker_pool_resources.get_allocated_quantity(resource)}," + f"{worker_pool_resources.get_available_quantity(resource)}" + ) + + # Perform worker pool step + self._logger.info( + "[%s] Need to perform a step before schedule().", + current_time, + ) + completed_tasks = self.PerformWorkerPoolStep(sim_time=current_time) + + # Finish all tasks that have now completed + for completed_task in completed_tasks: + self.CleanupTaskExecution( + 
task=completed_task, + sim_time=current_time + ) + + + # TODO (Sukrit): Change this to a better implementation. + # Let's do some simple scheduling for now, that gives a fixed number of + # executors to all the available applications in intervals of 10 seconds. + if len(self._workload.task_graphs) >= 1: + scheduler_placements = self._scheduler.schedule( + sim_time=EventTime(current_time.time, EventTime.Unit.US), + workload=self._workload, + worker_pools=self._worker_pools, + ) + + # Filter the scheduler_placements that are now in CANCEL_TASK state. + cancel_task_placements = list(filter( + lambda p: p.placement_type == Placement.PlacementType.CANCEL_TASK, + scheduler_placements, + )) + self._logger.info( + "[%s] Received %s tasks to be cancelled: %s.", + current_time, + len(cancel_task_placements), + cancel_task_placements, + ) + # Issue task cancellations for identified tasks and taskgraphs so that + # the taskgraphs are no longer in consideration + for placement in cancel_task_placements: + # Update the task placement decision so that we can stop + # responding to RPC calls from its driver based on CANCEL_TASK type + + if placement.task.task_graph not in self._placements: + self._placements[placement.task.task_graph] = {} + self._logger.warning( + "[%s] Came to cancel a placement but taskgraph %s was not in " + "self._placements. Creating an empty dict entry.", + current_time, + placement.task.task_graph, + ) + self._placements[placement.task.task_graph][placement.task] = placement + self._logger.info( + "[%s] Added cancel placement to taskgraph %s for task %s. 
" + "Placement: %s", + current_time, + placement.task.task_graph, + placement.task, + placement, + ) + + # Since even one task getting cancelled, implies task-graph + # cancellation, we add the task-graph to cancelled set + if placement.task.task_graph not in self._cancelled_taskgraphs: + self._cancelled_taskgraphs.add(placement.task.task_graph) + self._total_taskgraphs_cancelled += 1 + self._logger.info( + "[%s] RUN_STATS (registered, met, missed, cancelled): %s, %s, %s, %s", + current_time, + self._total_taskgraphs_registered, + self._total_taskgraphs_met, + self._total_taskgraphs_missed, + self._total_taskgraphs_cancelled, + ) + + self._logger.info( + "[%s] Cancelling task: %s from taskgraph: %s", + current_time, + placement.task.name, + placement.task.task_graph, + ) + # Sending tasks to cancel. + placement.task.cancel( + time=current_time, + ) + + # Also cancel the task-graph so that all dependent tasks are removed + task_graph = self._workload.get_task_graph(placement.task.task_graph) + if task_graph is None: + self._logger.error("[%s] No TaskGraph found for %s", + current_time, + placement.task.task_graph, + ) + + for cancelled_task in task_graph.cancel(placement.task, current_time): + self._logger.info( + "[%s] Further cancelling dependent task: %s from taskgraph: %s", + current_time, + placement.task.name, + placement.task.task_graph, + ) + + cancelled_task.cancel( + time=current_time, + ) + + # TODO: (DG): Ensure that task-graph is removed from the workload and + # doesn't show up in the next iteration of tetrisched scheduler? + + # Filter the scheduler_placements that are not of type PLACE_TASK and + # have not been placed. 
+ filtered_placements = filter( + lambda p: p.placement_type == Placement.PlacementType.PLACE_TASK + and p.is_placed(), + scheduler_placements, + ) + for placement in sorted( + filtered_placements, key=attrgetter("placement_time") + ): + if placement.task.task_graph not in self._placements: + self._placements[placement.task.task_graph] = {} + self._logger.info( + "[%s] Want to add a placement but taskgraph %s was not in " + "self._placements. Creating an empty dict entry.", + current_time, + placement.task.task_graph, + ) + if placement.task not in self._placements[placement.task.task_graph]: + self._logger.info( + "[%s] Adding new placement to taskgraph %s for task %s. " + "Placement: %s", + current_time, + placement.task.task_graph, + placement.task, + placement, + ) + else: + self._logger.info( + "[%s] Updating an existing placement in taskgraph %s for task %s. " + "Placement: %s", + current_time, + placement.task.task_graph, + placement.task, + placement, + ) + self._placements[placement.task.task_graph][placement.task] = placement + + # Schedule the task here since marking it as running requires it to be + # scheduled before. We mark it to be running when we inform the + # framework of the placement. + + # TODO: (DG) ASK - dont think tasks need to be marked as unscheduled on cancellation? 
+ placement.task.schedule( + time=placement.placement_time, + placement=placement, + ) + + + # Handle task placements that have returned with unplaced tasks + unplaced_placements = filter( + lambda p: p.placement_type == Placement.PlacementType.PLACE_TASK + and not p.is_placed(), + scheduler_placements, + ) + for placement in unplaced_placements: + if placement.task.task_graph not in self._placements: + self._logger.info( + "[%s] Taskgraph %s not found for task %s, couldn't invalidate " + "it or it was previously invalidated.", + current_time, + placement.task.task_graph, + placement.task, + ) + elif placement.task in self._placements[placement.task.task_graph]: + self._logger.info( + "[%s] Invalidated the placement (taskgraph %s and task %s)" + "from self._placements along with entire taskgraph.", + current_time, + placement.task.task_graph, + placement.task, + ) + for task in self._placements[placement.task.task_graph]: + self._logger.info( + "[%s] Invalidating the placement for task %s " + "from self._placements due to invalidation of %s.", + current_time, + task, + placement.task, + ) + # Unschedule the task + if task.state is TaskState.SCHEDULED: + task.unschedule(time=current_time) + else: + self._logger.warning( + "[%s] Could not unschedule since task %s was " + "found in state %s in during invalidation of %s.", + current_time, + task, + task.state, + placement.task, + ) + # delete the taskgraph at once since we cant change size + # of dict while iterating + del self._placements[placement.task.task_graph] + else: + self._logger.info( + "[%s] Couldn't invalidate placement (taskgraph %s and task %s)." 
async def run_scheduler(self) -> None:
    """Checks if the scheduler is running, and if not, starts it.

    If the scheduler is already running, we queue up another execution of the
    scheduler. This execution batches the scheduling requests, and runs the
    scheduler only once for all the requests."""
    async with self._scheduler_running_lock:
        if not self._scheduler_running:
            asyncio.create_task(self.schedule())
        else:
            self._rerun_scheduler = True

async def RegisterFramework(self, request, context):
    """Registers a new framework with the backend scheduler.
    This is the entry point for a new instance of Spark / Flink to register
    itself with the backend scheduler, and is intended as an EHLO.

    Args:
        request: The registration request; its `name`, `uri` and `timestamp`
            fields are read.
        context: The RPC context (unused).

    Returns:
        A `RegisterFrameworkResponse` with `success=False` if a framework was
        already registered, and `success=True` otherwise.
    """
    if self._initialized:
        self._logger.warning(
            "Framework already registered at %s with the address %s at %s",
            self._initialization_time,
            self._master_uri,
            self._initialization_time,
        )
        return erdos_scheduler_pb2.RegisterFrameworkResponse(
            success=False,
            message=f"Framework already registered at "
            f"{self._initialization_time} at the address {self._master_uri}",
        )

    # Setup a new Framework instance.
    framework_name = request.name
    self._master_uri = request.uri
    # The registration timestamp becomes the origin that later requests are
    # measured against.
    self._initialization_time = EventTime(request.timestamp, EventTime.Unit.US)
    self._initialized = True
    self._logger.info(
        "[%s] Registering framework %s with URI %s.",
        self._initialization_time,
        framework_name,
        self._master_uri,
    )

    # Setup the simulator types.
    parsed_uri = urlparse(self._master_uri)
    self._worker_pool = WorkerPool(
        name=f"WorkerPool_{parsed_uri.netloc}",
        _logger=self._logger,
    )
    self._worker_pools = WorkerPools(worker_pools=[self._worker_pool])
    self._workload = Workload.from_task_graphs({})

    # Return the response.
    return erdos_scheduler_pb2.RegisterFrameworkResponse(
        success=True,
        message=f"{framework_name} at {self._master_uri} registered successfully!",
    )
" + "The driver requires %s cores and %s memory.", + sim_time, + request.id, + request.uri, + request.cores, + request.memory, + ) + driver_resources = Resources( + resource_vector={Resource(name="Slot_CPU", _id="any"): 1}, + _logger=self._logger, + ) + driver_job = Job( + name=request.id, + profile=WorkProfile( + name=f"WorkProfile_{request.id}", + execution_strategies=ExecutionStrategies( + [ + ExecutionStrategy( + resources=driver_resources, + batch_size=1, + # NOTE (Sukrit): Drivers are long running, and have no + # fixed runtime. Setting it to zero helps us unload the + # driver from the Worker whenever we need it. + runtime=EventTime.zero(), + ) + ] + ), + ), + ) + driver = Task( + name=request.id, + task_graph=request.uri, + job=driver_job, + deadline=EventTime.invalid(), + _logger=self._logger, + ) + self._drivers[request.id] = driver + + # Iterate over the Workers and find a Worker that can accomodate the driver. + placement_found = False + for worker in self._worker_pool.workers: + for execution_strategy in driver.available_execution_strategies: + if worker.can_accomodate_strategy(execution_strategy): + # This Worker can accomodate the Driver, we assign it here. + placement_found = True + # self._worker_pool.place_task(driver, execution_strategy, worker.id) + + # Update the Task's state and placement information. + placement_time = sim_time + driver.schedule( + time=placement_time, + placement=Placement( + type=Placement.PlacementType.PLACE_TASK, + computation=driver, + placement_time=placement_time, + worker_pool_id=self._worker_pool.id, + worker_id=worker.id, + strategy=execution_strategy, + ), + ) + driver.start(placement_time) + + # Tell the framework to start the driver. 
async def DeregisterDriver(self, request, context):
    """Deregisters a driver and marks its task as finished.

    Args:
        request: The deregistration request; its `id` and `timestamp` fields
            are read.
        context: The RPC context (unused).

    Returns:
        A `DeregisterDriverResponse` with `success=False` if no framework is
        registered or the driver id is unknown, and `success=True` otherwise.
    """
    request_time = EventTime(request.timestamp, EventTime.Unit.US)

    if not self._initialized:
        # Until a framework registers, `_initialization_time` still holds the
        # integer sentinel assigned in `__init__`, so no relative time is
        # derived from it here; the request's own timestamp is logged instead.
        self._logger.warning(
            "[%s] Trying to deregister a driver with id %s, "
            "but no framework is registered yet.",
            request_time,
            request.id,
        )
        return erdos_scheduler_pb2.DeregisterDriverResponse(
            success=False, message="Framework not registered yet."
        )

    # Time of the request relative to the framework's registration.
    completion_time = request_time - self._initialization_time

    if request.id not in self._drivers:
        self._logger.warning(
            "[%s] Trying to deregister a driver with id %s, "
            "but no driver with that id is registered.",
            completion_time,
            request.id,
        )
        return erdos_scheduler_pb2.DeregisterDriverResponse(
            success=False,
            message=f"[{completion_time}] Driver with id {request.id} not registered yet.",
        )

    # Deregister the driver.
    driver = self._drivers[request.id]
    # self._worker_pool.remove_task(completion_time, driver)
    driver.finish(completion_time)
    del self._drivers[request.id]
    return erdos_scheduler_pb2.DeregisterDriverResponse(
        success=True,
        message=f"[{completion_time}] Driver with id {request.id} deregistered successfully!",
    )
+ """ + sim_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + + if not self._initialized: + self._logger.warning( + "[%s] Trying to register a task graph with ID %s and name %s, " + "but no framework is registered yet.", + sim_time, + request.id, + request.name, + ) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, message="Framework not registered yet.", num_executors=0 + ) + + if request.id in self._workload.task_graphs: + self._logger.warning( + "[%s] The application with ID %s and name %s was already registered.", + sim_time, + request.id, + request.name, + ) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, + message=f"[{sim_time}] Application ID {request.id} with name " + f"{request.name} already registered!", + num_executors=0, + ) + + self._logger.info( + "[%s] Attempting to register application ID %s with name %s", + sim_time, + request.id, + request.name, + ) + # Check if query is from TPC-H workload. + # If yes, retrieve profiled slots and runtime info. 
If no, use default values + is_tpch_query = False + tpch_query_all_stage_info = None + if request.name.startswith("TPCH Query"): + is_tpch_query = True + # retrieve tasks-per-stage and runtime info based on query specifications + # Split the string by spaces + query_parts = request.name.split() + + # Initialize dataset_size and max_executor variables with default + tpch_query_num = None + tpch_dataset_size = int(FLAGS.tpch_dataset_size) + tpch_max_executors_per_job = int(FLAGS.tpch_max_executors_per_job) + + # Check if the string has the required format + # Format 1: "TPCH Query " + # Format 2: "TPCH Query " + if len(query_parts) >= 3 and query_parts[0] == "TPCH" and query_parts[1] == "Query": + tpch_query_num = int(query_parts[2]) + + # If dataset size and max cores are provided + if len(query_parts) >= 5: + tpch_dataset_size = int(query_parts[3]) + tpch_max_executors_per_job = int(query_parts[4]) + + tpch_query_all_stage_info = get_all_stage_info_for_query( + query_num=tpch_query_num, + profile_type=FLAGS.tpch_profile_type, + dataset_size=tpch_dataset_size, + max_executors=tpch_max_executors_per_job) + + same_structure, stage_id_mapping = verify_and_relable_tpch_app_graph( + query_num=tpch_query_num, dependencies=request.dependencies + ) + + # return failure message if not tpch app isnt of same DAG structure + if not same_structure: + self._logger.warning( + "[%s] TPCH application with ID %s and name %s couldn't be registered." + "DAG structure mismatch!", + sim_time, + request.id, + request.name, + ) + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=False, + message=f"[{sim_time}] TPCH application ID {request.id} with name {request.name}" + f" couldn't be registered. DAG structure mismatch!", + num_executors=0, + ) + + # Construct all the Tasks for the TaskGraph. 
+ task_ids_to_task: Mapping[int, Task] = {} + default_resource = Resources( + resource_vector={Resource(name="Slot_CPU", _id="any"): 20}, + _logger=self._logger, + ) + default_runtime = EventTime( + math.ceil(20 * FLAGS.spark_task_duration_multiplier), + EventTime.Unit.US + ) + + for i, task_dependency in enumerate(request.dependencies): + framework_task = task_dependency.key + if is_tpch_query: + mapped_stage_id = stage_id_mapping[framework_task.id] + + # NOTE: task_slots and task_runtime given to scheduler might be updated + # based on tpch_max_executors_per_job. If task_slots > + # tpch_max_executors_per_job, we transform (task_slots * task_runtime) + # as tpch_max_executors_per_job * ( + # (task_slots * task_runtime)/tpch_max_executors_per_job + # ) + # TODO: (DG) It is not foolproof since scheduler can give more than + # tpch_max_executors_per_job to app if it decides to run multiple + # independent stages in parallel + + profiled_task_slots = ( + tpch_query_all_stage_info[mapped_stage_id]["num_tasks"] + ) + + # Profiled runtime (in ms) * duration_multiplier is converted + # to nearest second + profiled_task_runtime = math.ceil( + ( + tpch_query_all_stage_info[mapped_stage_id] + ["avg_task_duration_ms"]/1000 + ) * FLAGS.spark_task_duration_multiplier + ) + + if FLAGS.uniformly_sample_task_slots: + # Chosen to override profiled tasks slots for TPCH + # TODO: (DG) The (20,60) range is outside default max_executors + # set to 50. Need to update code to correctly use max_executors later + # TODO: (DG) Don't like that seed is now going to change the dag structure + # everytime a new app arrives in the workload. + # Induces variability but seems weird. + # NOTE: tpch_max_ececutors is 50 but we will sample upto 70. 
+ task_slots = self._rng.randint(30, 70) + else: + task_slots = (profiled_task_slots + if profiled_task_slots <= tpch_max_executors_per_job + else tpch_max_executors_per_job + ) + + # TODO: (DG) Adjust runtime if using uniformly_sample_task_slots + # Currently, runtimes still being calculated based on profiled_task_slots + # Setting minimum task_runtime to 8s to allow stages to complete + task_runtime = max(8, ( + profiled_task_runtime + if profiled_task_slots <= tpch_max_executors_per_job + else math.ceil( + (profiled_task_slots * + profiled_task_runtime)/tpch_max_executors_per_job) + ) + ) + if profiled_task_slots > tpch_max_executors_per_job: + self._logger.info( + "[%s] Profiled slots > tpch_max_executors_per_job: %s. Converted " + "(slots,runtime) from (%s,%s) to (%s, %s)", + sim_time, + tpch_max_executors_per_job, + profiled_task_slots, + profiled_task_runtime, + task_slots, + task_runtime, + ) + + self._logger.info( + "[%s] Creating Task for given app TPCH stage: %s, mapped to " + "original stage id %s, with tasks: %s and avg runtime (s): %s. " + "Used multiplier: %s", + sim_time, + framework_task.id, + mapped_stage_id, + task_slots, + task_runtime, + FLAGS.spark_task_duration_multiplier, + ) + task_ids_to_task[framework_task.id] = Task( + name=f"task_{framework_task.name}_{i}", + task_graph=request.id, + job=Job( + name=f"job_{framework_task.name}_{i}", + profile=WorkProfile( + name=f"WorkProfile_{framework_task.name}", + execution_strategies=ExecutionStrategies( + [ + ExecutionStrategy( + resources=( + default_resource + if not is_tpch_query + else Resources( + resource_vector={ + Resource( + name="Slot_CPU", _id="any" + ): task_slots + }, + _logger=self._logger, + ) + ), + batch_size=1, + runtime=( + default_runtime + if not is_tpch_query + else EventTime(task_runtime, EventTime.Unit.US) + ), + ) + ] + ), + ), + ), + # NOTE: (DG) Removed setting deadline here and will set deadline + # based on taskgraphs critical path instead. 
+ deadline=EventTime.invalid(), + + # TODO (Sukrit): We should maintain a counter for each application + # type so that we can correlate the Tasks with a particular invocation. + timestamp=1, + _logger=self._logger, + ) + # NOTE (Sukrit): We maintain the StageID of the Task as a separate field + # that is not accessible / used by the Simulator. + task_ids_to_task[framework_task.id].stage_id = framework_task.id + self._logger.info( + "[%s] Constructed Task %s for the TaskGraph %s.", + sim_time, + framework_task.name, + request.id, + ) + + # Construct the TaskGraph from the Tasks. + task_graph_structure: Mapping[Task, Sequence[Task]] = {} + for task_dependency in request.dependencies: + task_graph_structure[task_ids_to_task[task_dependency.key.id]] = [ + task_ids_to_task[task_id] for task_id in task_dependency.children_ids + ] + task_graph = TaskGraph( + name=request.id, + tasks=task_graph_structure, + ) + + # Calculating critical path time from task graph + critical_path = task_graph.get_longest_path( + weights=lambda task: (task.slowest_execution_strategy.runtime.time) + ) + critical_path_time = ( + sum( + [t.slowest_execution_strategy.runtime for t in critical_path], + start=EventTime.zero(), + ) + .to(EventTime.Unit.US) + .time + ) + + # Setting taskgraph and task deadlines using critical_path_time * deadline_variance_factor + deadline_variance_factor = 1.0 + ( + self._rng.randint( + self._min_task_graph_deadline_variance, + self._max_task_graph_deadline_variance + ) + )/100 + task_graph_slo_time = math.ceil( + critical_path_time * deadline_variance_factor + ) + + for task in task_graph.get_nodes(): + deadline = EventTime(sim_time.time + task_graph_slo_time, + unit=EventTime.Unit.US + ) + task.update_deadline(deadline) + + task_graph.to_dot(f"{request.id}.dot") + self._workload.add_task_graph(task_graph) + self._logger.info( + "[%s] Added the TaskGraph(name=%s, id=%s, deadline=%s, " + "critical_path_time = %s, task_graph_slo_time = %s, " + 
"deadline_variance_factor= %s) to the Workload.", + sim_time, + request.name, + request.id, + task_graph.deadline, + critical_path_time, + task_graph_slo_time, + deadline_variance_factor, + ) + self._logger.info( + "[%s] The structure of the TaskGraph %s is \n%s.", + sim_time, + request.id, + str(task_graph), + ) + + # Increment total number of taskgraphs registered. + self._total_taskgraphs_registered += 1 + + # Show current run statistics. + self._logger.info( + "[%s] RUN_STATS (registered, met, missed, cancelled): %s, %s, %s, %s", + sim_time, + self._total_taskgraphs_registered, + self._total_taskgraphs_met, + self._total_taskgraphs_missed, + self._total_taskgraphs_cancelled, + ) + + # Return the response. + # TODO: (DG) Might want to change the number of initial executors if it causes + # issues in scaled up expts + return erdos_scheduler_pb2.RegisterTaskGraphResponse( + success=True, + message=f"[{sim_time}] Application ID {request.id} with name " + f"{request.name} and deadline {task_graph.deadline} registered successfully!", + num_executors=FLAGS.initial_executors, + ) + + async def RegisterEnvironmentReady(self, request, context): + """Registers that the environment (i.e., executors) are ready for the given + TaskGraph at the specified time. + + This is intended to release the sources of the TaskGraph to the scheduling + backend, to consider the application in this scheduling cycle. + """ + sim_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + + if not self._initialized: + self._logger.warning( + "[%s] Trying to register that the environment is ready for the TaskGraph " + "with ID %s, but no framework is registered yet.", + sim_time, + request.id, + ) + return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( + success=False, message=f"[{sim_time}] Framework not registered yet." 
+ ) + + task_graph = self._workload.get_task_graph(request.id) + if task_graph is None: + self._logger.warning( + "[%s] Trying to register that the environment is ready for the TaskGraph " + "with ID %s, but no TaskGraph with that ID is registered.", + sim_time, + request.id, + ) + return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( + success=False, + message=f"[{sim_time}] TaskGraph with ID {request.id} not registered yet.", + ) + + if request.num_executors != FLAGS.initial_executors: + self._logger.warning( + "[%s] The TaskGraph %s requires %s executors, but the environment is ready " + "with %s executors.", + sim_time, + request.id, + FLAGS.initial_executors, + request.num_executors, + ) + return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( + success=False, + message=f"Number of executors not {FLAGS.initial_executors}.", + ) + + # Release all the sources of the TaskGraph at the given time. + for source_task in task_graph.get_source_tasks(): + source_task.release(sim_time) + + self._logger.info(f"[{sim_time}] Environment ready for TaskGraph with ID {request.id}!") + + # Run the scheduler since the Workload has changed. + await self.run_scheduler() + + return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse( + success=True, + message=f"[{sim_time}] Environment ready for TaskGraph with ID {request.id}!", + ) + + async def DeregisterFramework(self, request, context): + """Deregisters the framework with the backend scheduler. + This is the exit point for a running instance of Spark / Flink to deregister""" + sim_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + + if not self._initialized: + self._logger.warning( + "[%s] Trying to deregister the framework at %s, " + "but no framework is registered yet.", + sim_time, + request.uri, + ) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=False, message=f"[{sim_time}] Framework not registered yet." 
+ ) + + if not self._master_uri == request.uri: + self._logger.warning( + "[%s] Trying to deregister the framework at %s, " + "but the registered framework is at %s.", + sim_time, + request.uri, + self._master_uri, + ) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=False, + message=f"[{sim_time}] Framework not registered at {request.uri} yet.", + ) + + # Deregister the framework. + self._initialization_time = None + self._master_uri = None + self._initialized = False + self._logger.info("[%s] Deregistering framework at %s", sim_time, request.uri) + return erdos_scheduler_pb2.DeregisterFrameworkResponse( + success=True, + message=f"[{sim_time}] Framework at {request.uri} deregistered successfully!", + ) + + async def RegisterWorker(self, request, context): + """Registers a new worker with the backend scheduler.""" + current_time = EventTime(int(time.time()), EventTime.Unit.US) - self._initialization_time + + if not self._initialized: + self._logger.warning( + "[%s] Trying to register a worker with name %s and id %s, " + "but no framework is registered yet.", + current_time, + request.name, + request.id, + ) + return erdos_scheduler_pb2.RegisterWorkerResponse( + success=False, message=f"[{current_time}] Framework not registered yet." + ) + + # First, we construct the Resources with the given size. + # TODO (Sukrit): Right now, we drop the memory requirements, we should use + # them to do multi-dimensional packing using STRL. + cpu_resource = Resource(name="Slot_CPU") + # TODO: (DG) Override the request.cores to avoid scaling up physical setup + worker_resources = Resources( + resource_vector={ + cpu_resource: request.cores if not FLAGS.override_worker_cpu_count + else 640 + }, + _logger=self._logger, + ) + self._logger.debug( + "[%s] Successfully constructed the resources for the worker %s: %s.", + current_time, + request.name, + worker_resources, + ) + + # Construct a new Worker instance, and add it to the WorkerPool. 
+ worker = Worker( + name=request.id, + resources=worker_resources, + _logger=self._logger, + ) + self._worker_pool.add_workers([worker]) + + self._logger.info( + "[%s] Registering worker with name %s, and resources %s.", + current_time, + worker.name, + worker_resources, + ) + + # Run the scheduler since the Resource set has changed, and new task graphs + # may become eligible to run. + await self.run_scheduler() + + return erdos_scheduler_pb2.RegisterWorkerResponse( + success=True, + message=f"[{current_time}] Worker {request.name} registered successfully!", + cores=FLAGS.virtualized_cores, + memory=FLAGS.virtualized_memory * 1024, + ) + + async def NotifyTaskCompletion(self, request, context): + """Notifies the backend scheduler that a task has completed.""" + sim_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + + if not self._initialized: + self._logger.warning( + "[%s] Trying to notify the backend scheduler that the task with ID %s " + "from application %s has completed, " + "but no framework is registered yet.", + sim_time, + request.task_id, + request.application_id, + ) + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=False, message="Framework not registered yet." + ) + + task_graph = self._workload.get_task_graph(request.application_id) + if task_graph is None: + self._logger.warning( + "[%s] Trying to notify the backend scheduler that the task with ID %s " + "from application %s has completed, but the application " + "was not registered with the backend yet.", + sim_time, + request.task_id, + request.application_id, + ) + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=False, + message=f"[{sim_time}] Application with ID {request.application_id} " + f"not registered yet.", + ) + + # Find the Task that has completed, and mark it as such. 
+ matched_task = None + for task in task_graph.get_nodes(): + if task.stage_id == request.task_id: + matched_task = task + if matched_task is None: + self._logger.warning( + "[%s] Trying to notify the backend scheduler that the task with ID %s " + "from application %s has completed, but the task " + "was not found in the TaskGraph.", + sim_time, + request.task_id, + request.application_id, + ) + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=False, + message=f"[{sim_time}] Task with ID {request.task_id} " + f"not found in TaskGraph {request.application_id}.", + ) + + # Instead of completing & removing the task immediately, check + # if it is actually complete or will complete in the future + + # Get the actual task completion timestamp + # actual_task_completion_time = ( + # matched_task.start_time.time + matched_task.remaining_time.time + # ) + actual_task_completion_time = ( + sim_time.time + matched_task.remaining_time.time + ) + + self._logger.info( + "[%s] Received task for completion. task.start_time: %s ," + "task.remaining_time: %s , actual completion time: %s. " + "Task details: %s", + sim_time.time, + matched_task.start_time.time, + matched_task.remaining_time.time, + actual_task_completion_time, + matched_task, + ) + + if sim_time.time > actual_task_completion_time: + self._logger.warning( + "[%s] Task exceeded actual completion time by %s, " + "Task details: %s", + sim_time.time, + (sim_time.time - actual_task_completion_time), + matched_task, + ) + + # TODO DG: remaining_time assumes execution of the slowest strategy + # Should be updated to reflect correct remaining_time based on chosen strategy? + + # Add all tasks to _tasks_marked_for_completion queue. 
+ # If task has actually completed, it will be dequeued immediately + # Else it will be dequeued at its actual task completion time + self._tasks_marked_for_completion.put( + TimedItem(actual_task_completion_time, matched_task) + ) + + # NOTE: task.finish() and run_scheduler() invocations are postponed + # until it is time for the task to be actually marked as complete. + + return erdos_scheduler_pb2.NotifyTaskCompletionResponse( + success=True, + message=f"Task with ID {request.task_id} marked for completion at " + f"{sim_time}! It will be removed on actual " + f"task completion time at {actual_task_completion_time}", + ) + + async def GetPlacements(self, request, context): + """Retrieves the placements applicable at the specified time.""" + sim_time = EventTime(request.timestamp, EventTime.Unit.US) - self._initialization_time + + if not self._initialized: + self._logger.warning( + "[%s] Trying to get placements for %s, " + "but no framework is registered yet.", + sim_time, + request.id, + ) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=False, message="Framework not registered yet." + ) + + if request.id not in self._placements: + self._logger.warning( + "[%s] Trying to get placements for %s, but the application " + "was not registered with the backend yet OR was cancelled.", + sim_time, + request.id, + ) + return erdos_scheduler_pb2.GetPlacementsResponse( + success=False, + message=f"[{sim_time}] Trying to get placements for " + f"{request.id}, but the application was not registered with the " + f"backend yet OR was cancelled." 
+ ) + + # Construct and return the placements., + placements = [] + + # Keep track of app_ids and task_names to delete after placements are issued + to_delete = [] + + for task in self._placements[request.id].keys(): + task_placement = self._placements[request.id][task] + if task.state is TaskState.CANCELLED: + # Task cancelled, add to list to remove from self._placements + to_delete.append((request.id, task)) + else: + if task_placement.placement_time <= sim_time: + # TODO: (DG) Due to small dataset size, each stage automatically gets + # one data partition i.e. one task and one executor. But later for + # large datasets, we might leverage use_profile_to_scale_executors + # to modify the placement before it is sent + self._logger.info( + f"[{sim_time}] Going to set placement.task to run: {task_placement}" + ) + + # Mark the Task as RUNNING. + # Right now we don't run task.start() if + # task is already in RUNNING or CANCELLED state. + # Only SCHEDULED -> RUNNING transition is allowed. + if task.state == TaskState.SCHEDULED: + try: + # Initialize the task at the given placement time, + # and place it on the WorkerPool. + worker_pool = self._worker_pools.get_worker_pool( + task_placement.worker_pool_id + ) + assert ( + worker_pool is not None + ), f"No WorkerPool found with ID: {task_placement.worker_pool_id}." 
+ + # Display worker pool utilization before placing task + # Cumulate the resources from all the WorkerPools + for worker_pool in self._worker_pools.worker_pools: + worker_pool_resources = worker_pool.resources + for resource_name in set( + map(lambda value: value[0].name, worker_pool_resources.resources) + ): + resource = Resource(name=resource_name, _id="any") + self._logger.info( + f"{sim_time},WORKER_POOL_UTILIZATION,{worker_pool.id}," + f"{resource_name}," + f"{worker_pool_resources.get_allocated_quantity(resource)}," + f"{worker_pool_resources.get_available_quantity(resource)}" + ) + + # Perform worker pool step + self._logger.info( + "[%s] Need to perform a step before place_task() for %s.", + sim_time, + task, + ) + completed_tasks = self.PerformWorkerPoolStep(sim_time=sim_time) + + # Finish all tasks that have now completed + for completed_task in completed_tasks: + self.CleanupTaskExecution( + task=completed_task, + sim_time=sim_time + ) + + # Place the task on the worker pool + if self._scheduler_is_task_type: + success = True + else: + success = worker_pool.place_task( + task, + execution_strategy=task_placement.execution_strategy, + worker_id=task_placement.worker_id, + ) + if success: + task.start(sim_time) + self._logger.info( + "[%s] Successfully started task: %s on worker_pool: %s", + sim_time, + task, + worker_pool, + ) + # resources = placement.execution_strategy.resources + placements.append( + erdos_scheduler_pb2.Placement( + worker_id=task_placement.worker_id, + application_id=request.id, + task_id=task_placement.task.stage_id, + cores=1, + ) + ) + + # Add to delete list for clearing placement after it has been released + to_delete.append((request.id, task)) + self._logger.debug( + "[%s] Added tuple (%s, %s) to to_delete list.", + sim_time, + request.id, + task, + ) + + # Add task_placement to executed_placements since it is now complete + self._executed_placements[task] = task_placement + else: + self._logger.warning( + "[%s] Could not start 
task: %s on worker_id: %s and execution strategy: %s", + sim_time, + task, + task_placement.worker_id, + task_placement.execution_strategy, + ) + except ValueError as e: + self._logger.error(f"[{sim_time}] start() errored for task: {task}") + self._logger.error(f"[{sim_time}] Error: {e}") + + # Remove issued placements from self._placements + for app_id, task_name in to_delete: + del self._placements[app_id][task_name] + self._logger.info( + "[%s] Removed placement (app_id=%s, task_name=%s) from self._placements", + sim_time, + app_id, + task_name, + ) + + self._logger.info( + "[%s] Constructed %s placements for application with ID %s.", + sim_time, + len(placements), + request.id, + ) + + # Run the scheduler since the Workload has changed. + await self.run_scheduler() + + return erdos_scheduler_pb2.GetPlacementsResponse( + success=True, + placements=placements, + message=f"[{sim_time}] Constructed {len(placements)} " + f"placements.", + ) + + # Function to pop tasks from queue based on actual completion time + async def PopTasksBasedOnTime(self): + while True: + if not self._tasks_marked_for_completion.empty(): + # Get the top item from the priority queue + top_item = self._tasks_marked_for_completion._queue[0][1] + + # Check if top item's timestamp is reached or passed by current time + current_time = EventTime(int(time.time()), EventTime.Unit.US) - self._initialization_time + if top_item.timestamp <= current_time.time: + # Pop the top item + popped_item = self._tasks_marked_for_completion.get() + self._logger.info( + "[%s] Removing task from pending completion queue. " + "Task details: %s. 
" + "Timestamp: %s", + current_time, + popped_item.task, + top_item.timestamp, + ) + + # Display worker pool utilization before removing task + # Cumulate the resources from all the WorkerPools + for worker_pool in self._worker_pools.worker_pools: + worker_pool_resources = worker_pool.resources + for resource_name in set( + map(lambda value: value[0].name, worker_pool_resources.resources) + ): + resource = Resource(name=resource_name, _id="any") + self._logger.info( + f"{current_time},WORKER_POOL_UTILIZATION,{worker_pool.id}," + f"{resource_name}," + f"{worker_pool_resources.get_allocated_quantity(resource)}," + f"{worker_pool_resources.get_available_quantity(resource)}" + ) + + # Perform worker pool step + self._logger.info( + "[%s] Need to perform a step before remove_task() for %s.", + current_time, + popped_item.task, + ) + completed_tasks = self.PerformWorkerPoolStep(sim_time=current_time) + # TODO: (DG) For simplicity, we only pop cleanup task state for a single + # popped-item in the loop at once. Later, we could cleanup all identified + # completed tasks here. + + if popped_item.task.state == TaskState.COMPLETED: + # It means that the task state was already cleaned up after another + # invocation of PerformWorkerPoolStep. Can skip here then. + self._logger.info( + "[%s] Task %s already in COMPLETED state while processing " + "in PopTasksBasedOnTime.", + current_time, + popped_item.task, + ) + else: + self._logger.info( + "[%s] PopTasksBasedOnTime invoking CleanupTaskExecution " + "for task %s", + current_time, + popped_item.task, + ) + self.CleanupTaskExecution(task=popped_item.task, + sim_time=current_time) + + # # Free the resources on the worker pool for the completed task + # task_placed_at_worker_pool = self._worker_pools.get_worker_pool( + # popped_item.task.worker_pool_id + # ) + # task_placed_at_worker_pool.remove_task( + # current_time=current_time, task=popped_item.task + # ) + + # # Mark the Task as completed. 
+ # # Also release the task from the scheduler service + # popped_item.task.update_remaining_time(EventTime.zero()) + # popped_item.task.finish(current_time) + + # # TODO: (DG) Check change here + # released_tasks, cancelled_tasks = self._workload.notify_task_completion( + # task=popped_item.task, + # finish_time=current_time) + + # # TODO: (DG) Check change here + # for new_released_task in released_tasks: + # new_released_task.release(current_time) + + # # TODO: Might do for cancelled too + + # # Mark task graph completed + # task_graph = self._workload.get_task_graph(popped_item.task.task_graph) + # if task_graph is None: + # self._logger.error(f"[{current_time}] Taskgraph for task {popped_item.task} is None") + # raise RuntimeError(f"[{current_time}] Taskgraph for task {popped_item.task} is None") + # if task_graph.is_complete(): + # self._logger.info(f"[{current_time}] Finished task_graph {task_graph.name}") + # if task_graph.deadline < current_time: + # self._logger.info(f"[{current_time}] Missed deadline for task_graph {task_graph.name}") + # self._total_taskgraphs_missed += 1 + # else: + # self._logger.info(f"[{current_time}] Met deadline for task_graph {task_graph.name}") + # self._total_taskgraphs_met += 1 + # self._logger.info( + # "[%s] RUN_STATS (registered, met, missed, cancelled): %s, %s, %s, %s", + # current_time, + # self._total_taskgraphs_registered, + # self._total_taskgraphs_met, + # self._total_taskgraphs_missed, + # self._total_taskgraphs_cancelled, + # ) + + # Run the scheduler since the Workload has changed. 
+ await self.run_scheduler() + + else: + # If the top item's timestamp hasn't been reached yet, + # sleep for a short duration + await asyncio.sleep(0.1) # TODO: Can adjust value, curr=0.1s + else: + # If the queue is empty, sleep for a short duration + await asyncio.sleep(0.1) # TODO: Can adjust value, curr=0.1s + + def PerformWorkerPoolStep(self, sim_time): + # Get time elapsed since last step up time + time_elapsed_since_last_step = ( + sim_time - self._last_step_up_time + ) + + # step up all tasks on the worker-pool to reflect correct remaining time + self._logger.info( + "[%s] Stepping for %s timesteps.", + sim_time, + time_elapsed_since_last_step, + ) + for worker_pool in self._worker_pools.worker_pools: + completed_tasks = worker_pool.step( + self._last_step_up_time, time_elapsed_since_last_step) + for task in completed_tasks: + self._logger.info( + "[%s] Task %s was now found complete.", + sim_time, + task, + ) + + # Update _last_step_up_time + self._last_step_up_time = sim_time + + return completed_tasks + + def CleanupTaskExecution(self, task, sim_time): + self._logger.info( + "[%s] Cleaning up task execution for task %s.", + sim_time, + task, + ) + + # Free the resources on the worker pool for the completed task + task_placed_at_worker_pool = self._worker_pools.get_worker_pool( + task.worker_pool_id + ) + task_placed_at_worker_pool.remove_task( + current_time=sim_time, task=task + ) + + # Mark the Task as completed. 
async def serve():
    """Starts the ERDOS Scheduling RPC server and waits until it terminates.

    The server listens (insecurely) on all interfaces at `FLAGS.port` and runs
    a `SchedulerServiceServicer` on a thread pool of `FLAGS.max_workers`.
    """
    # Build the server and attach the scheduler service to it.
    thread_pool = futures.ThreadPoolExecutor(max_workers=FLAGS.max_workers)
    server = grpc.aio.server(thread_pool)
    servicer = SchedulerServiceServicer()
    erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server(servicer, server)

    # Bind the port and serve until termination.
    server.add_insecure_port(f"[::]:{FLAGS.port}")
    await server.start()
    print("Initialized ERDOS Scheduling RPC Server on port", FLAGS.port)
    await server.wait_for_termination()
def main(argv):
    """Parses the flags, prints the key overrides and runs the RPC server.

    Args:
        argv: The command-line arguments handed over by `app.run`.
    """
    # Parse the command-line flags.
    parsed_flags = flags.FLAGS
    parsed_flags(argv)

    # Echo the two flags that most affect an experiment.
    multiplier = parsed_flags.spark_task_duration_multiplier
    override_worker_cpus = parsed_flags.override_worker_cpu_count
    print("Multiplier:", multiplier)
    print("Override worker CPUs:", override_worker_cpus)

    # Drive serve() to completion on the event loop, closing the loop afterwards.
    event_loop = asyncio.get_event_loop()
    try:
        event_loop.run_until_complete(serve())
    finally:
        event_loop.close()


if __name__ == "__main__":
    app.run(main)
It should be `17` and not `21`. + + +## Step 1: Setup `spark-mirror` +Clone the repository with submodules +```bash +git clone https://github.com/dhruvsgarg/spark_mirror.git --recursive +``` + +> NOTE: If the submodule was cloned earlier but has been updated since, `git fetch --all` will not be able to track those changes. To pull in updates +> from submodule's parent, run `git submodule update --init --recursive`. + +### Verify branch +Verify or set current branch `erdos-spark-integration` + +### Verify env variable `SPARK_HOME` +Verify or set `SPARK_HOME` to point to the correct location of `spark-mirror`. + +### Verify env variable `JAVA_HOME` +> NOTE: `JAVA_HOME` should automatically get set to `/path/to/anaconda3/envs/<env_name>/lib/jvm` + +### For first time compilation (entire package) +```bash +./build/sbt package +``` + +### For subsequent, quicker iterations +Start the interactive shell +```bash +./build/sbt +``` + +Switch to project spark-core +```bash +project core +``` + +Compile and then package +```bash +compile +package +``` + +### Fix guava versions for ERDOS-Spark integration +Fresh compile+package of spark adds `guava-14.0.1.jar` under `/path/to/spark_mirror/assembly/target/scala-2.13/jars/`. +This jar interferes with gRPC which requires a `guava-31` jar. To fix: +- Remove existing `guava-14` jar: `rm assembly/target/scala-2.13/jars/guava-14.0.1.jar` +- Run `./sbin/patch-erdos.sh` +- Verify `guava-31.0.1-jre.jar` exists under `assembly/target/scala-2.13/jars/` + +### Update `PATH` with spark bin files +```bash +export PATH=$PATH:/path/to/spark_mirror/bin +``` + +## Step 2: Compile ERDOS +> NOTE: The `erdos-scheduling-simulator` in Step 2 refers to the separately cloned repository. It is not the `erdos-scheduling-simulator` submodule within +the `spark-mirror` repository.
+ +### Clone repo +```bash +git clone https://github.com/erdos-project/erdos-scheduling-simulator.git --recursive +``` + +### Install requirements for the package +```bash +pip install -r requirements.txt +``` + +### Set `GUROBI_DIR` +```bash +export GUROBI_DIR=/serenity/scratch/dgarg/gurobi/gurobi1003/linux64 +``` + +### Build inside schedulers/tetrisched/build/ +```bash +export CMAKE_INSTALL_MODE=ABS_SYMLINK + +cmake .. -DINSTALL_GTEST=OFF -DTBB_INSTALL=OFF +``` + +* Verify that python bindings are written to the new `<env_name>` conda env and not some old env + +### Run make +```bash +make -j install +``` + +### Test that simulator works with `simple_av_workload` +> NOTE: Might need to create `experiments` sub-directory if it doesn't already exist +```bash +python3 main.py --flagfile=configs/simple_av_workload.conf > experiments/simple_av_workload_test.output +``` +The TaskGraph should complete and meet its deadline. + + +## Step 3: Spark-Erdos service functionality test +> NOTE: As in step 2, the `erdos-scheduling-simulator` here also refers to the separately cloned repository. + +From the base directory: + +### Install the requirements +```bash +pip install -r rpc/requirements.txt +``` + +### Run protoc to generate the service and message definitions using +```bash +python -m grpc_tools.protoc -I./rpc/protos --python_out=. --grpc_python_out=. ./rpc/protos/rpc/erdos_scheduler.proto +``` + +### Run the service +```bash +python -m rpc.service --enforce_deadlines --scheduler_runtime=0 +``` + +The above command uses the default argument values from the `service.py` and `main.py`. The default scheduler is `EDF`. Other options available for the +service are `FIFO` and `TetriSched`. The DSched scheduler is a specific instantiation of the `TetriSched` scheduler.
The other schedulers can be run +as follows: + +#### To instantiate FIFO scheduler for the service: +```bash +python -m rpc.service --scheduler=FIFO --enforce_deadlines --scheduler_runtime=0 +``` + +#### To instantiate DSched scheduler for the service: +```bash +python -m rpc.service --scheduler=TetriSched --enforce_deadlines --scheduler_runtime=0 --release_taskgraphs --opt_passes=CRITICAL_PATH_PASS --opt_passes=CAPACITY_CONSTRAINT_PURGE_PASS --opt_passes=DYNAMIC_DISCRETIZATION_PASS --retract_schedules --scheduler_max_occupancy_threshold=0.999 --finer_discretization_at_prev_solution --scheduler_selective_rescheduling --scheduler_reconsideration_period=0.6 --scheduler_time_discretization=1 --scheduler_max_time_discretization=5 --finer_discretization_window=5 --scheduler_log_to_file +``` + +### Run local tests for the erdos-spark service +> NOTE: Verify that `pytest` is installed in the `<env_name>` conda env. Else first do `pip install pytest`. Once installed, run the tests using: +```bash +pytest tests/test_service.py +``` + +## Step 4: Running ERDOS with Spark backend + +### Start the service +```bash +python -m rpc.service +``` +Refer to the above section to instantiate different schedulers for the service. + +> NOTE: Since we emulate a 20-node spark cluster on a single system, an additional flag `--override_worker_cpu_count` needs to be passed in the +> service launch command. + +### Start all components of the spark cluster +Run the following commands from the root directory of the `spark-mirror` repository. 
+ +Also, verify that environment variable `SPARK_HOME` is set correctly to point to the path of `spark_mirror` + +* Start Spark Master +```bash +./sbin/start-master.sh --host <spark-master-ip> --properties-file /path/to/spark_mirror/conf/<config-file>.conf +``` + +* Start Spark Worker +```bash +./sbin/start-worker.sh spark://<spark-master-ip>:7077 --properties-file /path/to/spark_mirror/conf/<config-file>.conf +``` + +* Start Spark History Server +```bash +./sbin/start-history-server.sh --properties-file /path/to/spark_mirror/conf/<config-file>.conf +``` + +At this point, the spark framework should be registered with the erdos-service. + +### Viewing spark cluster status +Start an SSH tunnel to the node hosting the spark cluster and access port `18080` using the command: +```bash +ssh -L 18080:<node-ip>:18080 <user>@<node-ip> +``` + +Once this command succeeds, you can view the History Server on your laptop's browser at URL: `localhost:18080` + +> NOTE: Same process needs to be repeated to view Master and Worker UIs. They run on ports `8080` and `8081` respectively. + +### Submitting a test spark application +To be submitted from within the `tpch-spark` repo: +```bash +/path/to/spark_mirror/bin/spark-submit --deploy-mode cluster --master spark://<spark-master-ip>:7077 --conf 'spark.port.maxRetries=132' --conf 'spark.eventLog.enabled=true' --conf 'spark.eventLog.dir=/path/to/event_log' --conf 'spark.sql.adaptive.enabled=false' --conf 'spark.sql.adaptive.coalescePartitions.enabled=false' --conf 'spark.sql.autoBroadcastJoinThreshold=-1' --conf 'spark.sql.shuffle.partitions=1' --conf 'spark.sql.files.minPartitionNum=1' --conf 'spark.sql.files.maxPartitionNum=1' --conf 'spark.app.deadline=120' --class 'main.scala.TpchQuery' target/scala-2.13/spark-tpc-h-queries_2.13-1.0.jar "4" "50" "50" +``` + +The above job submission is parameterized by `(DEADLINE, QUERY_NUM, DATASET_SIZE, MAX_CORES)`. An example input value for this tuple is +`(120, 4, 50, 50)`. +> Refer to `launch_expt_script.py` in `tpch-spark` for more details on eligible values for these parameters and how they are used. 
+ +> NOTE: By default, env variable `TPCH_INPUT_DATA_DIR` will look for `dbgen` inside the current working directory. While it works for `spark-submit` +> issued from inside the `tpch-spark` repository, it needs to be explicitly set otherwise. + +Once submitted, review the application's runtime status on the Spark Web UI. + +### Shutdown cluster +* To stop the master and worker(s) after the experiment concludes, run: +```bash +./sbin/stop-all.sh +``` + +> NOTE: This command does not terminate the History Server process. \ No newline at end of file diff --git a/rpc/tpch_utils.py b/rpc/tpch_utils.py index ebc4e3cd..48b28f83 100644 --- a/rpc/tpch_utils.py +++ b/rpc/tpch_utils.py @@ -2,106 +2,45 @@ import ast import json +import yaml import os from typing import Mapping, Sequence import networkx as nx import numpy as np -HOME_TPCH_DIR = "../profiles/workload/tpch_decima/" -TPCH_SUBDIR = "2g/" +from data.tpch_loader import get_all_stage_info_for_query -class SetWithCount(object): - """ - allow duplication in set - """ - - def __init__(self): - self.set = {} - - def __contains__(self, item): - return item in self.set - - def add(self, item): - if item in self.set: - self.set[item] += 1 - else: - self.set[item] = 1 - - def clear(self): - self.set.clear() - - def remove(self, item): - self.set[item] -= 1 - if self.set[item] == 0: - del self.set[item] - - -def pre_process_task_duration(task_duration): - # remove fresh durations from first wave - clean_first_wave = {} - for e in task_duration["first_wave"]: - clean_first_wave[e] = [] - fresh_durations = SetWithCount() - # O(1) access - for d in task_duration["fresh_durations"][e]: - fresh_durations.add(d) - for d in task_duration["first_wave"][e]: - if d not in fresh_durations: - clean_first_wave[e].append(d) - else: - # prevent duplicated fresh duration blocking first wave - fresh_durations.remove(d) - - -def get_all_stage_info_for_query(query_num): - task_durations = np.load( - os.path.join( - HOME_TPCH_DIR, TPCH_SUBDIR, 
"task_duration_" + str(query_num) + ".npy" - ), - allow_pickle=True, - ).item() - - num_nodes = len(task_durations) - - stage_info = {} - - for n in range(num_nodes): - task_duration = task_durations[n] - e = next(iter(task_duration["first_wave"])) - # NOTE: somehow only picks the first element {2: [n_tasks_in_ms]} - - num_tasks = len(task_duration["first_wave"][e]) + len( - task_duration["rest_wave"][e] - ) - - # remove fresh duration from first wave duration - # drag nearest neighbor first wave duration to empty spots - pre_process_task_duration(task_duration) - rough_duration = np.mean( - [i for t in task_duration["first_wave"].values() for i in t] - + [i for t in task_duration["rest_wave"].values() for i in t] - + [i for t in task_duration["fresh_durations"].values() for i in t] - ) - - curr_stage = { - "stage_id": n, - "num_tasks": num_tasks, - "avg_task_duration": round(rough_duration), - } - stage_info[n] = curr_stage - - return stage_info +TPCH_PARENT_DIR = "/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/" def get_base_tpch_graph_structure(query_num): - # use query_num to read string from file - with open(os.path.join(HOME_TPCH_DIR, "query_dag.json")) as f: - tpch_query_json = json.load(f) + with open(os.path.join(TPCH_PARENT_DIR, "queries.yaml")) as f: + tpch_query_yaml = yaml.load(f, Loader=yaml.FullLoader) + + # Extract the graph structure for the given query number + query_graph = None + for graph in tpch_query_yaml["graphs"]: + if graph["name"] == f"Q{query_num}": + query_graph = graph["graph"] + break + + if query_graph is None: + raise ValueError(f"Query number {query_num} not found in the YAML file") + + # Convert the graph structure to a format suitable for nx.DiGraph + query_dependency = [] + for node in query_graph: + if "children" in node: + for child in node["children"]: + query_dependency.append((node["name"], child)) + else: + # Ensure each tuple has two elements by adding a dummy node + query_dependency.append((node["name"], 
None)) - # get query dependency from file - query_dependency = ast.literal_eval(tpch_query_json["query_number"][str(query_num)]) + # Remove any tuples where the second element is None + query_dependency = [edge for edge in query_dependency if edge[1] is not None] # convert job structure into a nx graph base_tpch_graph = nx.DiGraph(query_dependency) diff --git a/schedulers/tetrisched_scheduler.py b/schedulers/tetrisched_scheduler.py index 6cbeb425..3198faed 100644 --- a/schedulers/tetrisched_scheduler.py +++ b/schedulers/tetrisched_scheduler.py @@ -601,11 +601,9 @@ def schedule( # Construct the STRL expression. scheduler_start_time = time.time() if len(tasks_to_be_scheduled) > 0 and any( - # If there is a Task belonging to a TaskGraph that hasn't been previously - # considered for scheduling and belongs to a TaskGraph that hasn't been - # cancelled, then we run the scheduler. + # If there is a Task belonging to a TaskGraph that hasn't + # been cancelled, then we run the scheduler. task.state != TaskState.SCHEDULED - and task.task_graph not in self._previously_considered_task_graphs and task.task_graph not in cancelled_task_graphs for task in tasks_to_be_scheduled ): diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py new file mode 100644 index 00000000..5c540b68 --- /dev/null +++ b/scripts/run_service_experiments.py @@ -0,0 +1,258 @@ +import argparse +import subprocess +import time +import traceback +from pathlib import Path +from dataclasses import dataclass +from datetime import datetime + + +def bang(cmd, dry_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE): + cmd = [str(part) for part in cmd] + print(" ".join(cmd)) + if dry_run: + return + p = subprocess.Popen(cmd, stdout=stdout, stderr=stderr) + return p + + +def must(cmd, dry_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE): + p = bang(cmd, dry_run, stdout, stderr) + if not dry_run: + if p.wait() != 0: + stdout, stderr = p.communicate() + raise Exception(f"Command 
failed. stdout: {stdout}. stderr: {stderr}.") + return p + + +@dataclass +class Service: + service_args: any + spark_mirror_path: Path + spark_master_ip: str + output_dir: Path + dry_run: bool + + _service = None + _master = None + _worker = None + + def __enter__(self): + log_file = self.output_dir / "service.log" + csv_file = self.output_dir / "service.csv" + + # launch service + with ( + open(self.output_dir / "service.stdout", "w") as f_out, + open(self.output_dir / "service.stderr", "w") as f_err, + ): + self._service = bang( + [ + *("python3", "-m", "rpc.service"), + *("--log_file_name", log_file), + *("--csv_file_name", csv_file), + *self.service_args, + ], + self.dry_run, + stdout=f_out, + stderr=f_err + ) + + # sleep for some time + if not self.dry_run: + time.sleep(3) + + try: + # launch spark master and worker + self._master = must( + [ + f"{self.spark_mirror_path}/sbin/start-master.sh", + *("--host", self.spark_master_ip), + *( + "--properties-file", + f"{self.spark_mirror_path}/conf/spark-dg-config.conf", + ), + ], + self.dry_run, + ) + self._worker = must( + [ + f"{self.spark_mirror_path}/sbin/start-worker.sh", + f"spark://{self.spark_master_ip}:7077", + *( + "--properties-file", + f"{self.spark_mirror_path}/conf/spark-dg-config.conf", + ), + ], + self.dry_run, + ) + except Exception as e: + self.clean() + raise e + + if not self.dry_run: + time.sleep(5) + + return self + + def wait(self): + self._service.wait() + + def clean(self): + if self._service: + self._service.terminate() + if self._master: + must([f"{self.spark_mirror_path}/sbin/stop-master.sh"], self.dry_run) + if self._worker: + must([f"{self.spark_mirror_path}/sbin/stop-worker.sh"], self.dry_run) + + def __exit__(self, type, value, traceback): + self.clean() + + +@dataclass +class Launcher: + launcher_args: any + spark_mirror_path: Path + spark_master_ip: str + tpch_spark_path: Path + output_dir: Path + dry_run: bool + + def launch(self): + with ( + open(self.output_dir / 
"launcher.stdout", "w") as f_out, + open(self.output_dir / "launcher.stderr", "w") as f_err, + ): + must( + [ + *("python3", "-u", "-m", "rpc.launch_tpch_queries"), + *self.launcher_args, + *("--spark-master-ip", self.spark_master_ip), + *("--spark-mirror-path", self.spark_mirror_path), + *("--tpch-spark-path", self.tpch_spark_path), + ], + self.dry_run, + stdout=f_out, + stderr=f_err, + ) + + +@dataclass +class Experiment: + name: str + service_args: any + launcher_args: any + + def run(self, args): + output_dir = args.output_dir / (self.name + '-' + datetime.now().isoformat()) + if not output_dir.exists(): + output_dir.mkdir(parents=True) + with open(output_dir / "service.args", "w") as f: + print(*self.service_args, sep='\n', file=f) + with open(output_dir / "launcher.args", "w") as f: + print(*self.launcher_args, sep='\n', file=f) + + with Service( + service_args=self.service_args, + spark_mirror_path=args.spark_mirror_path, + spark_master_ip=args.spark_master_ip, + output_dir=output_dir, + dry_run=args.dry_run, + ) as s: + Launcher( + launcher_args=self.launcher_args, + spark_mirror_path=args.spark_mirror_path, + spark_master_ip=args.spark_master_ip, + tpch_spark_path=args.tpch_spark_path, + output_dir=output_dir, + dry_run=args.dry_run, + ).launch() + s.wait() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--dry-run", + action="store_true", + help="Prints commands that will be executed for each experiment", + ) + parser.add_argument( + "--spark-mirror-path", + type=Path, + required=True, + help="Path to spark-mirror repository", + ) + parser.add_argument( + "--spark-master-ip", + type=str, + required=True, + help="IP address of node running Spark master", + ) + parser.add_argument( + "--tpch-spark-path", + type=Path, + required=True, + help="Path to TPC-H Spark repository", + ) + parser.add_argument("--output-dir", type=Path, default=Path("exp-output")) + args = parser.parse_args() + + if not args.output_dir.exists(): + 
args.output_dir.mkdir(parents=True) + + base_args = [ + "--enforce_deadlines", + "--override_worker_cpu_count", + ] + variance_args = [ + *("--min_deadline_variance", 10), + *("--max_deadline_variance", 25), + ] + edf_args = [ + *("--scheduler", "EDF"), + ] + dsched_args = [ + *("--scheduler", "TetriSched"), + "--release_taskgraphs", + *("--opt_passes", "CRITICAL_PATH_PASS"), + *("--opt_passes", "CAPACITY_CONSTRAINT_PURGE_PASS"), + *("--opt_passes", "DYNAMIC_DISCRETIZATION_PASS"), + "--retract_schedules", + *("--scheduler_max_occupancy_threshold", 0.999), + "--finer_discretization_at_prev_solution", + "--scheduler_selective_rescheduling", + *("--scheduler_reconsideration_period", 0.6), + *("--scheduler_time_discretization", 1), + *("--scheduler_max_time_discretization", 5), + *("--finer_discretization_window", 5), + *("--scheduler_plan_ahead_no_consideration_gap", 1), + ] + experiments = [ + Experiment( + name="dsched-q300-hard", + service_args=[ + *base_args, + *dsched_args, + *variance_args, + ], + launcher_args=[ + *("--num_queries", 300), + *("--variable_arrival_rate", 0.052), + ], + ), + ] + + for i, experiment in enumerate(experiments): + try: + print(f"=== {experiment.name} ({i+1}/{len(experiments)}) ===") + experiment.run(args) + print("=== done ===") + except Exception as e: + print(traceback.format_exc()) + print(f"Failed to run experiment '{experiment}'. Exception: '{e}'") + + +if __name__ == "__main__": + main() diff --git a/simulator.py b/simulator.py index 1ccf06fd..a752c58f 100644 --- a/simulator.py +++ b/simulator.py @@ -5,7 +5,7 @@ from enum import Enum from functools import total_ordering from operator import attrgetter, itemgetter -from typing import Mapping, Optional, Sequence +from typing import Mapping, Optional, Sequence, Callable, Dict, List import absl # noqa: F401 @@ -43,6 +43,7 @@ class EventType(Enum): SCHEDULER_FINISHED = 12 # Signifies the end of the scheduler loop. SIMULATOR_END = 13 # Signify the end of the simulator loop. 
LOG_UTILIZATION = 14 # Ask the simulator to log worker pool utilization. + LOG_STATS = 15 # Log simulator statistics def __lt__(self, other) -> bool: # This method is used to order events in the event queue. We prioritize @@ -223,6 +224,9 @@ def reheapify(self): def __len__(self) -> int: return len(self._event_queue) + def __str__(self) -> str: + return str(self._event_queue) + class Simulator(object): """A `Simulator` simulates the execution of the different tasks in the @@ -251,7 +255,7 @@ def __init__( self, worker_pools: WorkerPools, scheduler: BaseScheduler, - workload_loader: BaseWorkloadLoader, + workload_loader: BaseWorkloadLoader = None, loop_timeout: EventTime = EventTime(time=sys.maxsize, unit=EventTime.Unit.US), scheduler_frequency: EventTime = EventTime(time=-1, unit=EventTime.Unit.US), _flags: Optional["absl.flags"] = None, @@ -259,7 +263,7 @@ def __init__( if not isinstance(scheduler, BaseScheduler): raise ValueError("Scheduler must implement the BaseScheduler interface.") - if not isinstance(workload_loader, BaseWorkloadLoader): + if workload_loader and not isinstance(workload_loader, BaseWorkloadLoader): raise ValueError( "WorkloadLoader must implement the BaseWorkloadLoader interface." ) @@ -337,10 +341,14 @@ def event_representation_filter(record): self.__log_utilization(self._simulator_time) # Internal data. - self._last_scheduler_start_time = self._simulator_time + self._last_scheduler_start_time = EventTime.invalid() self._next_scheduler_event = None self._last_scheduler_placements: Optional[Placements] = None + # Stores current placements for tasks of a task graph + # task_graph => {task_id => placement} + self._current_task_graph_placements: Dict[str, Dict[str, Placement]] = {} + # A Cache from the TaskID to a future Placement event in the EventQueue. # The Simulator uses this bookkeeping to revoke / invalidate decisions made # by the past scheduler invocations. 
@@ -372,6 +380,14 @@ def event_representation_filter(record): self._finished_task_graphs = 0 self._missed_task_graph_deadlines = 0 + # Is the simulator orchestrated? + self._orchestrated = _flags.orchestrated + + # Minimum duration by which to push task placements + self._min_placement_push_duration = EventTime( + _flags.min_placement_push_duration, EventTime.Unit.US + ) + # Initialize the event queue. # To make the system continue working the loop, we add three events: # - SIMULATOR_START: A notional event start the simulator and log into the CSV. @@ -391,16 +407,20 @@ def event_representation_filter(record): sim_start_event, ) + if self._orchestrated: + return + # Second, create the UPDATE_WORKLOAD event to retrieve the latest Workload. - upate_workload_event = Event( - event_type=EventType.UPDATE_WORKLOAD, time=self._simulator_time - ) - self._event_queue.add_event(upate_workload_event) - self._logger.info( - "[%s] Added %s to the event queue.", - self._simulator_time.time, - upate_workload_event, - ) + if self._workload_loader: + upate_workload_event = Event( + event_type=EventType.UPDATE_WORKLOAD, time=self._simulator_time + ) + self._event_queue.add_event(upate_workload_event) + self._logger.info( + "[%s] Added %s to the event queue.", + self._simulator_time.time, + upate_workload_event, + ) # Third, create the SCHEDULER_START event to invoke the scheduler. sched_start_event = Event( @@ -465,19 +485,15 @@ def dry_run(self) -> None: ) def simulate(self) -> None: - """Run the simulator loop. + """Run the simulator loop to fixpoint. This loop requires the `Workload` to be populated with the `TaskGraph`s whose execution is to be simulated using the Scheduler. """ - # Run the simulator loop. - while True: - time_until_next_event = self._event_queue.peek().time - self._simulator_time - # If there are any running tasks, step through the execution of the - # Simulator until the closest remaining time. 
+ def f(): + time_until_next_event = self.__time_until_next_event() running_tasks = self._worker_pools.get_placed_tasks() - if len(running_tasks) > 0: # There are running tasks, figure out the minimum remaining # time across all the tasks. @@ -496,20 +512,68 @@ def simulate(self) -> None: # the next event in the queue, step all workers until the # completion of that task, otherwise, handle the next event. if min_task_remaining_time < time_until_next_event: - self.__step(step_size=min_task_remaining_time) + step_size = min_task_remaining_time else: - # NOTE: We step here so that all the Tasks that are going - # to finish as a result of this step have their TASK_FINISHED - # events processed first before any future placement occurs - # that is decided prior. - self.__step(step_size=time_until_next_event) - if self.__handle_event(self._event_queue.next()): - break + step_size = time_until_next_event else: - # Step until the next event is supposed to be executed. - self.__step(step_size=time_until_next_event) - if self.__handle_event(self._event_queue.next()): - break + step_size = time_until_next_event + return None if time_until_next_event.is_invalid() else step_size + + self.__simulate_f(should_step=f) + + def tick(self, until: EventTime) -> None: + """Tick the simulator until the specified time""" + + def f(): + time_until_next_event = self.__time_until_next_event() + + if ( + not time_until_next_event.is_invalid() + and (time_until_next_event + self._simulator_time) <= until + ): + return time_until_next_event + + return None + + self.__simulate_f(should_step=f) + + def __simulate_f(self, should_step: Callable[None, Optional[EventTime]]) -> None: + """Steps the simulator while a predicate is satisfied. + + This method continuously advances the simulation by calling the + provided `should_step` function, which determines the size of each + simulation step. The simulation continues until `should_step` returns + None, indicating that stepping should stop. 
+ + Args: + should_step (Callable[[EventTime], bool]): + A predicate function that determines the next step size for the simulation. + - If the function returns an EventTime value, the simulator steps by that amount. + - If the function returns None, the simulation stops. + """ + while True: + step_size = should_step() + if not step_size: + break + self.__step(step_size=step_size) + if self._event_queue.peek() and self.__handle_event( + self._event_queue.next() + ): + break + + def get_current_placements_for_task_graph( + self, task_graph_name: str + ) -> List[Placement]: + if task_graph_name not in self._current_task_graph_placements: + self._logger.warning(f"Cannot recognize task graph '{task_graph_name}'") + return [] + return list(self._current_task_graph_placements[task_graph_name].values()) + + def __time_until_next_event(self) -> EventTime: + if self._event_queue.peek(): + return self._event_queue.peek().time - self._simulator_time + else: + return EventTime.invalid() def __handle_scheduler_start(self, event: Event) -> None: """Handle the SCHEDULER_START event. The method invokes the scheduler, and adds @@ -518,6 +582,10 @@ def __handle_scheduler_start(self, event: Event) -> None: Args: event (`Event`): The event to handle. """ + + if self._last_scheduler_start_time == event.time: + return + # Log the required CSV information. 
currently_placed_tasks = self._worker_pools.get_placed_tasks() schedulable_tasks = self._workload.get_schedulable_tasks( @@ -669,6 +737,9 @@ def __create_events_from_task_placement_skip( task=cancelled_task, ) ) + self._current_task_graph_placements[placement.task.task_graph][ + placement.task.id + ] = placement if task_graph.is_cancelled(): released_tasks_from_new_task_graph = ( @@ -921,6 +992,10 @@ def count_placed_tasks(placements: Placements): ) ) + # NOP if there are no previous placements + if self._last_scheduler_placements is None: + return + num_placed = count_placed_tasks(self._last_scheduler_placements) num_unplaced = count_placed_tasks(self._last_scheduler_placements) - num_placed scheduler_runtime = event.time - self._last_scheduler_start_time @@ -1027,18 +1102,19 @@ def count_placed_tasks(placements: Placements): # Reset the available tasks and the last task placement. self._last_scheduler_placements = None - # The scheduler has finished its execution, insert an event for the next - # invocation of the scheduler. - next_sched_event = self.__get_next_scheduler_event( - event, - self._scheduler_frequency, - self._last_scheduler_start_time, - self._loop_timeout, - ) - self._event_queue.add_event(next_sched_event) - self._logger.info( - "[%s] Added %s to the event queue.", event.time.time, next_sched_event - ) + if not self._orchestrated: + # The scheduler has finished its execution, insert an event for the next + # invocation of the scheduler. + next_sched_event = self.__get_next_scheduler_event( + event, + self._scheduler_frequency, + self._last_scheduler_start_time, + self._loop_timeout, + ) + self._event_queue.add_event(next_sched_event) + self._logger.info( + "[%s] Added %s to the event queue.", event.time.time, next_sched_event + ) # Now that all the tasks are placed, ask the simulator to log the resource # utilization and quit later, if requested. 
@@ -1066,6 +1142,7 @@ def __handle_task_cancellation(self, event: Event) -> None: f"{event.task.timestamp},{event.task.id},{event.task.task_graph}," f"{event.task.slowest_execution_strategy.runtime.time}" ) + self.log_stats(event.time) # If the task already had a placement, we remove the placement from our queue. if event.task.id in self._future_placement_events: @@ -1149,8 +1226,13 @@ def __handle_task_finished(self, event: Event) -> None: task_placed_at_worker_pool = self._worker_pools.get_worker_pool( event.task.worker_pool_id ) + task_placed_at_worker_pool.remove_task(current_time=event.time, task=event.task) - event.task.finish() + + # Remove the task from it's task graph's current placements + del self._current_task_graph_placements[event.task.task_graph][event.task.id] + + event.task.finish(event.time) # Log the TASK_FINISHED event into the CSV. self._finished_tasks += 1 @@ -1170,13 +1252,21 @@ def __handle_task_finished(self, event: Event) -> None: if task_graph.deadline > event.time else event.time - task_graph.deadline ) + + # Remove task graph from current task graph placements map + del self._current_task_graph_placements[event.task.task_graph] + self._csv_logger.debug( f"{event.time.time},TASK_GRAPH_FINISHED,{task_graph.name}," f"{task_graph.deadline.to(EventTime.Unit.US).time}," f"{tardiness.to(EventTime.Unit.US).time}" ) + if task_graph.deadline < event.time: self._missed_task_graph_deadlines += 1 + + self.log_stats(event.time) + self._logger.info( "[%s] Finished the TaskGraph %s with a deadline %s at the " "completion of the task %s with a tardiness of %s.", @@ -1306,6 +1396,72 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: ), "Inconsistency in future placements." task_graph = workload.get_task_graph(task.task_graph) assert task_graph is not None, "Inconsistency in Task placement and Workload." 
+ + # Subroutine to handle avoid automatic re-placement of tasks in the next timestep + # if they were unable to start either due to (i) parent task not finished or + # (ii) worker not ready. The sub-tree rooted at the task is unscheduled and will + # be placed again in the next run of the scheduler. + def unschedule_subtree_rooted_at_task(task): + # Find all dependent tasks rooted from given task to unschedule + def subtree_tasks_to_unschedule(task): + tasks_to_unschedule = [task] + for child_task in task_graph.get_children(task): + tasks_to_unschedule.extend(subtree_tasks_to_unschedule(child_task)) + return tasks_to_unschedule + + tasks_to_unschedule = subtree_tasks_to_unschedule(task) + self._logger.info("[%s] Going to unschedule tasks rooted from %s. " + "List of tasks that will be unscheduled are: %s", + event.time.time, + task, + tasks_to_unschedule) + for unschedule_task in tasks_to_unschedule: + if unschedule_task.id in self._future_placement_events: + future_placement_event = self._future_placement_events[ + unschedule_task.id + ] + if future_placement_event.time > event.time: + # Delete future event from event_queue and from future_placement_events + self._event_queue.remove_event(future_placement_event) + del self._future_placement_events[unschedule_task.id] + msg = ( + f"[{event.time.time}] Retrieved future placement event {future_placement_event} " + f"for task {unschedule_task} and removed it." + ) + self._logger.info(msg) + elif future_placement_event.time == event.time: + # Cannot delete from event_queue, as this event is likely being processed + del self._future_placement_events[unschedule_task.id] + msg = ( + f"[{event.time.time}] Removed future placement event {future_placement_event} " + f"for task {unschedule_task} at the same time." + ) + self._logger.info(msg) + else: + msg = ( + f"[{event.time.time}] Future placement event {future_placement_event} for task " + f"{unschedule_task} is in the past." 
+ ) + self._logger.warning(msg) + + # Unschedule the task + if unschedule_task.state == TaskState.SCHEDULED: + unschedule_task.unschedule(event.time) + self._csv_logger.debug( + f"{event.time.time},TASK_UNSCHEDULED,{unschedule_task.name},{unschedule_task.timestamp}," + f"{unschedule_task.id},{unschedule_task.task_graph}" + ) + msg = ( + f"[{event.time.time}] Finished unscheduling of task {unschedule_task}." + ) + self._logger.info(msg) + else: + msg = ( + f"[{event.time.time}] Task {unschedule_task} was not in SCHEDULED state and was in " + f"{unschedule_task.state} state. Skip unscheduling." + ) + self._logger.info(msg) + if not task.is_ready_to_run(task_graph): if task.state == TaskState.CANCELLED or task_graph.is_cancelled(): # The Task was cancelled. Consume the event. @@ -1330,32 +1486,20 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: return else: # If the Task is not ready to run and wasn't cancelled, - # find the next possible time to try executing the task. - parent_completion_time = max( - parent.remaining_time for parent in task_graph.get_parents(task) - ) - next_placement_time = event.time + max( - parent_completion_time, EventTime(1, EventTime.Unit.US) - ) - next_placement_event = Event( - event_type=event.event_type, - time=next_placement_time, - task=event.task, - placement=event.placement, - ) - self._future_placement_events[task.id] = next_placement_event - self._event_queue.add_event(next_placement_event) + # unschedule the task and its subtree. self._logger.info( - "[%s] The Task %s was not ready to run, and has been pushed for " - "later placement at %s.", + "[%s] The Task %s was not ready to run. 
The task along with its " + "sub-tree will be unscheduled.", event.time.to(EventTime.Unit.US).time, task, - next_placement_time, ) self._csv_logger.debug( f"{event.time.time},TASK_NOT_READY,{task.name},{task.timestamp}," f"{task.id},{event.placement.worker_pool_id}" ) + + # Unschedule the task and its subtree rooted at this task. + unschedule_subtree_rooted_at_task(task) return # Initialize the task at the given placement time, and place it on # the WorkerPool. @@ -1363,6 +1507,7 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: assert ( worker_pool is not None ), f"No WorkerPool found with ID: {event.placement.worker_pool_id}." + success = worker_pool.place_task( task, execution_strategy=event.placement.execution_strategy, @@ -1387,27 +1532,27 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None: "[%s] Placed %s on %s.", event.time.time, task, worker_pool ) del self._future_placement_events[task.id] + self._current_task_graph_placements[task.task_graph][ + task.id + ] = event.placement else: - next_placement_time = event.time + EventTime(1, EventTime.Unit.US) - next_placement_event = Event( - event_type=event.event_type, - time=next_placement_time, - task=event.task, - placement=event.placement, - ) - self._event_queue.add_event(next_placement_event) - self._future_placement_events[task.id] = next_placement_event + # If the placement was not successful, send the sub-tree of the taskgraph + # rooted at this task back to its previous state. It allows the scheduler + # to re-schedule in its next run. self._logger.warning( - "[%s] Task %s cannot be placed on worker %s, pushing placement to %s.", + "[%s] Task %s couldn't be placed on worker %s. 
The task along with its " + "sub-tree will be unscheduled.", event.time.time, task, - worker_pool, - next_placement_time, + event.placement.worker_pool_id, ) self._csv_logger.debug( f"{event.time.time},WORKER_NOT_READY,{task.name},{task.timestamp}," f"{task.id},{event.placement.worker_pool_id}" ) + + # Unschedule the task and its subtree rooted at this task. + unschedule_subtree_rooted_at_task(task) def __handle_task_migration(self, event: Event) -> None: """Handles the TASK_MIGRATION event. This event must be followed by a @@ -1503,6 +1648,9 @@ def __handle_update_workload(self, event: Event) -> None: raise ValueError( f"__handle_update_workload called with event of type {event.type}." ) + if not self._workload_loader: + raise ValueError("UPDATE_WORKLOAD event enqueued without workload_loader") + updated_workload = self._workload_loader.get_next_workload( current_time=self._simulator_time ) @@ -1525,6 +1673,16 @@ def __handle_update_workload(self, event: Event) -> None: # Release the Tasks that have become available. releasable_tasks = self._workload.get_releasable_tasks() + + # Ignore non-source tasks, they get auto-released when the parent finishes + def is_source_task(task): + task_graph = self._workload.get_task_graph(task.task_graph) + return task_graph.is_source_task(task) + + releasable_tasks = [ + task for task in releasable_tasks if is_source_task(task) + ] + self._logger.info( "[%s] The WorkloadLoader %s has %s TaskGraphs that released %s tasks.", self._simulator_time.to(EventTime.Unit.US).time, @@ -1539,19 +1697,36 @@ def __handle_update_workload(self, event: Event) -> None: len(releasable_tasks), ) - # Add the TaskGraphRelease events into the system. + # Add task graph entry in self._current_task_graph_placements to + # track its task placements + # + # In addition to newly added task graphs, self._workload also + # contains all previously released task graphs. 
+ # + # So, we guard the addition of the entry on two conditions: + # (1) The task graph doesn't have an entry (we don't want to + # nuke an existing one) + # (2) The task graph is not complete (we only keep the entry + # alive while the task graph is running to avoid a memory + # leak) for task_graph_name, task_graph in self._workload.task_graphs.items(): - event = Event( - event_type=EventType.TASK_GRAPH_RELEASE, - time=task_graph.release_time, - task_graph=task_graph_name, - ) - self._event_queue.add_event(event) - self._logger.info( - "[%s] Added %s to the event queue.", - self._simulator_time.to(EventTime.Unit.US).time, - event, - ) + if ( + task_graph_name not in self._current_task_graph_placements + and not task_graph.is_complete() + ): + self._current_task_graph_placements[task_graph_name] = {} + + event = Event( + event_type=EventType.TASK_GRAPH_RELEASE, + time=task_graph.release_time, + task_graph=task_graph_name, + ) + self._event_queue.add_event(event) + self._logger.info( + "[%s] Added %s to the event queue.", + self._simulator_time.to(EventTime.Unit.US).time, + event, + ) max_release_time = self._simulator_time for task in releasable_tasks: @@ -1577,7 +1752,8 @@ def __handle_update_workload(self, event: Event) -> None: else self._simulator_time + self._workload_update_interval ), ) - self._event_queue.add_event(next_update_event) + # TODO(elton): Handle this properly + # self._event_queue.add_event(next_update_event) self._logger.info( "[%s] Added %s to the event queue.", self._simulator_time.time, @@ -1657,17 +1833,16 @@ def __handle_event(self, event: Event) -> bool: self.__handle_scheduler_finish(event) elif event.event_type == EventType.SIMULATOR_END: # End of the simulator loop. 
+ self.log_stats(event.time) self._csv_logger.debug( - f"{event.time.time},SIMULATOR_END,{self._finished_tasks}," - f"{self._cancelled_tasks},{self._missed_task_deadlines}," - f"{self._finished_task_graphs}," - f"{len(self._workload.get_cancelled_task_graphs())}," - f"{self._missed_task_graph_deadlines}" + f"{event.time.time},SIMULATOR_END", ) self._logger.info("[%s] Ending the simulator loop.", event.time.time) return True elif event.event_type == EventType.LOG_UTILIZATION: self.__log_utilization(event.time) + elif event.event_type == EventType.LOG_STATS: + self.log_stats(event.time) else: raise ValueError(f"[{event.time}] Retrieved event of unknown type: {event}") return False @@ -1680,7 +1855,9 @@ def __step(self, step_size: EventTime = EventTime(1, EventTime.Unit.US)) -> None the clock (in us). """ if step_size < EventTime.zero(): - raise ValueError(f"Simulator cannot step backwards {step_size}") + raise ValueError( + f"[{self._simulator_time}] Simulator cannot step backwards {step_size}" + ) # Step the simulator for the required steps and construct TASK_FINISHED events # for any tasks that were able to complete their execution. 
@@ -1707,13 +1884,14 @@ def __step(self, step_size: EventTime = EventTime(1, EventTime.Unit.US)) -> None self._simulator_time.time, [event.task.unique_name for event in task_finished_events], ) - for task_finished_event in task_finished_events: - self._event_queue.add_event(task_finished_event) - self._logger.info( - "[%s] Added %s to the event queue.", - self._simulator_time.time, - task_finished_event, - ) + if not self._orchestrated: + for task_finished_event in task_finished_events: + self._event_queue.add_event(task_finished_event) + self._logger.info( + "[%s] Added %s to the event queue.", + self._simulator_time.time, + task_finished_event, + ) def __get_next_scheduler_event( self, @@ -2006,9 +2184,18 @@ def __run_scheduler(self, event: Event) -> Event: f"Received no Placements object from the Scheduler at {event.time}.", ) - # Calculate the time at which the placements need to be applied. placement_time = event.time + placements.runtime + for placement in placements: + # If the placement is in the past, update it to match + # `placement_time` + # This scenario happens when the `scheduler_runtime` is non-zero. + if placement._placement_time and placement._placement_time < placement_time: + self._logger.warning( + f"[{self._simulator_time}] Placement is in the past. Updating placement time from {placement._placement_time} to {placement_time}" + ) + placement._placement_time = placement_time + # Save the placements until the placement time arrives. 
self._last_scheduler_placements = placements @@ -2038,3 +2225,12 @@ def __log_utilization(self, sim_time: EventTime): f"{worker_pool_resources.get_allocated_quantity(resource)}," f"{worker_pool_resources.get_available_quantity(resource)}" ) + + def log_stats(self, sim_time: EventTime): + self._csv_logger.debug( + f"{sim_time.time},LOG_STATS,{self._finished_tasks}," + f"{self._cancelled_tasks},{self._missed_task_deadlines}," + f"{self._finished_task_graphs}," + f"{len(self._workload.get_cancelled_task_graphs())}," + f"{self._missed_task_graph_deadlines}" + ) diff --git a/tests/test_service.py b/tests/test_service.py new file mode 100644 index 00000000..9f623445 --- /dev/null +++ b/tests/test_service.py @@ -0,0 +1,328 @@ +import re +import time +import subprocess + +import pytest +import grpc +from rpc import erdos_scheduler_pb2 +from rpc import erdos_scheduler_pb2_grpc + + +@pytest.fixture(scope="module", autouse=True) +def service(): + process = subprocess.Popen(["python", "-m", "rpc.service", "--enforce_deadlines"]) + channel = grpc.insecure_channel("localhost:50051") + try: + grpc.channel_ready_future(channel).result(timeout=5) + yield process + finally: + channel.close() + process.kill() + + +def test_service(): + channel = grpc.insecure_channel("localhost:50051") + stub = erdos_scheduler_pb2_grpc.SchedulerServiceStub(channel) + + # Register a framework + request = erdos_scheduler_pb2.RegisterFrameworkRequest( + name="test_framework", uri="http://localhost/test", timestamp=1234567890 + ) + response = stub.RegisterFramework(request) + assert response.success and re.search( + r"Registered the framework 'test_framework' with URI http://localhost/test at UNIX time", + response.message, + ) + + # Register a worker + request = erdos_scheduler_pb2.RegisterWorkerRequest( + name="test_worker", + id="1234", + cores=100, + memory=1024, + ) + response = stub.RegisterWorker(request) + assert response.success and re.search( + r"Registered worker \(id=1234, 
name=test_worker\)", response.message + ) + + # Try to fetch placements for an unregistered task graph + # Get placements for the task, should be empty + request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-0", + ) + response = stub.GetPlacements(request) + assert not response.success and re.search( + r"Task graph with id \'task-graph-0\' is not registered or does not exist", + response.message, + ) + + # TODO: move to environment ready + # Register an incorrect TaskGraph + # request = erdos_scheduler_pb2.RegisterTaskGraphRequest( + # id="task-graph", + # name="TPCH Query 4 50 50", + # timestamp=1234567890, + # dependencies=[ + # {"key": {"id": 0, "name": "stage 0"}, "children_ids": [1, 2]}, + # ], + # ) + # response = stub.RegisterTaskGraph(request) + # assert not response.success and re.search( + # r"Failed to load TPCH query 4. Exception: Structure of dependencies provided for query number 4 does not match that of canonical dependencies", + # response.message, + # ) + + # Register the first (correct) TaskGraph, it will be able to run + request = erdos_scheduler_pb2.RegisterTaskGraphRequest( + id="task-graph-0", + name="TPCH Query 4 50 50", + timestamp=1234567890, + dependencies=[ + {"key": {"id": 0, "name": "stage 0"}, "children_ids": [2]}, + {"key": {"id": 1, "name": "stage 1"}, "children_ids": [2]}, + {"key": {"id": 2, "name": "stage 2"}, "children_ids": [3]}, + {"key": {"id": 3, "name": "stage 3"}, "children_ids": [4]}, + {"key": {"id": 4, "name": "stage 4"}, "children_ids": []}, + ], + ) + response = stub.RegisterTaskGraph(request) + assert ( + response.success + and re.search( + r"Registered task graph 'task-graph-0' successfully", + response.message, + ) + and response.num_executors == 10 + ) + + # Introduce a 2s delay in getting the env ready + time.sleep(2) + + # Mark the environment as ready + request = erdos_scheduler_pb2.RegisterEnvironmentReadyRequest( + id="task-graph-0", + num_executors=10, + 
timestamp=1234567890, + ) + response = stub.RegisterEnvironmentReady(request) + assert response.success and re.search( + r"Successfully marked environment as ready for task graph 'Q4\[task-graph-0\]@1'", + response.message, + ) + + time.sleep(3) + + # Get placements for the task + request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-0", + ) + response = stub.GetPlacements(request) + assert response.success + actual_task_ids = set() + for placement in response.placements: + assert ( + placement.worker_id == "1234" and placement.application_id == "task-graph-0" + ) + actual_task_ids.add(placement.task_id) + assert actual_task_ids == {0, 1} + + # Wait for 3 seconds and trigger notify task completion for tasks 0 and 1 + time.sleep(3) + + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( + application_id="task-graph-0", task_id=0, timestamp=1234567890 + ) + response = stub.NotifyTaskCompletion(request) + assert response.success + + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( + application_id="task-graph-0", task_id=1, timestamp=1234567890 + ) + response = stub.NotifyTaskCompletion(request) + assert response.success + + # Wait for 20s to allow the service to execute task completion for fastest task + time.sleep(20) + + # Attempt to incorrectly notify task completion for task 3, which hasnt started yet + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( + application_id="task-graph-0", task_id=3, timestamp=1234567890 + ) + response = stub.NotifyTaskCompletion(request) + assert not response.success + + # Wait 2s to allow the service to process the incorrect task completion + time.sleep(2) + + # Wait for 25s to allow the service to finish execution of task 0 + time.sleep(25) + + # This will unlock task 2, which should now be returned as a placement + request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-0", + ) + response = stub.GetPlacements(request) + assert 
response.success + actual_task_ids = set() + for placement in response.placements: + assert ( + placement.worker_id == "1234" and placement.application_id == "task-graph-0" + ) + actual_task_ids.add(placement.task_id) + assert actual_task_ids == {2} + + # Attempt to register the second TaskGraph, wont be able to run due to inadequate resources + request = erdos_scheduler_pb2.RegisterTaskGraphRequest( + id="task-graph-1", + name="TPCH Query 4 50 200", + timestamp=1234567890, + dependencies=[ + {"key": {"id": 0, "name": "stage 0"}, "children_ids": [2]}, + {"key": {"id": 1, "name": "stage 1"}, "children_ids": [2]}, + {"key": {"id": 2, "name": "stage 2"}, "children_ids": [3]}, + {"key": {"id": 3, "name": "stage 3"}, "children_ids": [4]}, + {"key": {"id": 4, "name": "stage 4"}, "children_ids": []}, + ], + ) + response = stub.RegisterTaskGraph(request) + assert ( + not response.success + and re.search( + r"The worker Pool cannot accomodate the task graph 'task-graph-1'", + response.message, + ) + and response.num_executors == 0 + ) + + # Register the third TaskGraph, will run but will get cancelled due to deadline miss + request = erdos_scheduler_pb2.RegisterTaskGraphRequest( + id="task-graph-2", + name="TPCH Query 4 50 50", + timestamp=1234567890, + dependencies=[ + {"key": {"id": 0, "name": "stage 0"}, "children_ids": [2]}, + {"key": {"id": 1, "name": "stage 1"}, "children_ids": [2]}, + {"key": {"id": 2, "name": "stage 2"}, "children_ids": [3]}, + {"key": {"id": 3, "name": "stage 3"}, "children_ids": [4]}, + {"key": {"id": 4, "name": "stage 4"}, "children_ids": []}, + ], + ) + response = stub.RegisterTaskGraph(request) + assert ( + response.success + and re.search( + r"Registered task graph 'task-graph-2' successfully", + response.message, + ) + and response.num_executors == 10 + ) + + # Introduce a 2s delay in getting the env ready + time.sleep(2) + + # Mark the environment as ready + request = erdos_scheduler_pb2.RegisterEnvironmentReadyRequest( + id="task-graph-2", 
+ num_executors=10, + timestamp=1234567890, + ) + response = stub.RegisterEnvironmentReady(request) + assert response.success and re.search( + r"Successfully marked environment as ready for task graph 'Q4\[task-graph-2\]@1'", + response.message, + ) + + # Wait for 10s to get the placements for the second task graph + time.sleep(10) + + # Get placements for the taskgraph 3, one of first two root vertices should be placed since there are resources + request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-2", + ) + response = stub.GetPlacements(request) + assert response.success + actual_task_ids = set() + for placement in response.placements: + assert ( + placement.worker_id == "1234" and placement.application_id == "task-graph-2" + ) + actual_task_ids.add(placement.task_id) + assert actual_task_ids == {1} + + # Wait for 100 more seconds and request placements again + time.sleep(100) + + # Notify task completion for task 2 in task graph 0 to trigger scheduler run again + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( + application_id="task-graph-0", task_id=2, timestamp=1234567890 + ) + response = stub.NotifyTaskCompletion(request) + assert response.success + + # Wait for 2 seconds to allow scheduler to process task completion and run scheduler + time.sleep(2) + + # Get placements for the task, entire taskgraph would be cancelled since deadline has passed. + # Since one root vertex (1) is running, the other root vertex (0) will be cancelled first, + # then the subsequent vertices. + # NOTE: The service will wait until all running/ scheduled tasks complete and are removed + # from the workerpool before issuing a terminate=True for the taskgraph. Until then it will + # return current placements for a taskgraph (including those already sent by the service) + # and wait for running tasks to finish. Spark will ignore it. 
+ request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-2", + ) + response = stub.GetPlacements(request) + assert response.success + actual_task_ids = set() + # Will return placement for task_id 1 + for placement in response.placements: + if placement.task_id == 1: + assert ( + placement.worker_id == "1234" + and placement.application_id == "task-graph-2" + ) + actual_task_ids.add(placement.task_id) + assert actual_task_ids == {1} + + # Wait for 5s to issue notify task completion for task_id 1 in task-graph-2 + time.sleep(5) + request = erdos_scheduler_pb2.NotifyTaskCompletionRequest( + application_id="task-graph-2", task_id=1, timestamp=1234567890 + ) + response = stub.NotifyTaskCompletion(request) + assert response.success + + # Wait for 5s to allow the simulator to process the event. + # Invoke get placements again for task-graph 2, it should return terminate=True now + time.sleep(5) + request = erdos_scheduler_pb2.GetPlacementsRequest( + timestamp=1234567890, + id="task-graph-2", + ) + response = stub.GetPlacements(request) + assert response.success + actual_task_ids = set() + # Will return placement for task_id 1 + for placement in response.placements: + actual_task_ids.add(placement.task_id) + assert len(actual_task_ids) == 0 + assert response.terminate == True + + # Deregister framework + request = erdos_scheduler_pb2.DeregisterFrameworkRequest( + name="test_framework", uri="http://localhost/test", timestamp=1234567890 + ) + response = stub.DeregisterFramework(request) + assert response.success and re.search( + r"Successfully deregistered the framework at http://localhost/test", + response.message, + ) + + channel.close() diff --git a/utils.py b/utils.py index 7e8f2814..1fa8adb0 100644 --- a/utils.py +++ b/utils.py @@ -93,24 +93,28 @@ def to_unchecked(self, unit: Unit) -> Tuple[float, Unit]: return self.time * self.unit.to(unit), unit def fuzz( - self, variance: Tuple[int, int], bounds: Tuple[int, int] = (0, 
sys.maxsize) + self, variance: Tuple[int, int], bounds: Tuple[int, int] = (0, sys.maxsize), rng: random.Random = None ) -> "EventTime": """Fuzz the time according to the provided `variance` and within the bounds. Args: variance (`Tuple[int, int]`): The (minimum, maximum) % variance to fuzz by. bounds (`Tuple[int, int]`): The (minimum, maximum) bounds to fuzz within. + rng (random.Random): The random number generator to use. Defaults to an internal RNG if none is specified. Returns: The fuzzed time according to the given variance. """ + if rng is None: + rng = type(self)._rng + min_variance, max_variance = variance min_bound, max_bound = bounds fuzzed_time = max( min_bound, min( max_bound, - type(self)._rng.uniform( + rng.uniform( self.time * abs(min_variance) / 100.0, self.time * abs(max_variance) / 100.0, ), diff --git a/workload/jobs.py b/workload/jobs.py index e0acb2e4..97b4966a 100644 --- a/workload/jobs.py +++ b/workload/jobs.py @@ -805,12 +805,23 @@ def _generate_task_graph( resolve_conditionals = False task_logger = setup_logging(name="Task") + # Create an RNG to be used when fuzzing deadlines, seeded by + # the TaskGraph name and the global random seed, if provided. + # This ensures that deadlines are deterministic, which is + # needed for simulator/Spark parity. + deadline_rng = random.Random( + (str(_flags.random_seed) if _flags else "") + task_graph_name + ) + # Generate the deadline for all the Tasks. # TODO (Sukrit): Right now, this assumes that all Tasks in the TaskGraph come # with the same deadline. At some point, we will have to implement a # heuristic-based deadline splitting technique. + + # NOTE: The taskgraph deadline is re-generated (and overwritten) after + # use_branch_predicated_deadlines code, since fuzz is invoked again there. task_deadline = release_time + self.completion_time.fuzz( - deadline_variance, deadline_bounds + deadline_variance, deadline_bounds, rng=deadline_rng ) # Generate all the `Task`s from the `Job`s in the graph. 
@@ -883,8 +894,10 @@ def _generate_task_graph( else: weighted_task_graph_length = self.__get_completion_time() + # NOTE: This is the second time the deadline is being set, based on a second + # invocation of fuzz. task_graph_deadline = release_time + weighted_task_graph_length.fuzz( - deadline_variance, deadline_bounds + deadline_variance, deadline_bounds, rng=deadline_rng ) if _flags and _flags.decompose_deadlines: stages_info = {} diff --git a/workload/tasks.py b/workload/tasks.py index 48691ae7..c929f696 100644 --- a/workload/tasks.py +++ b/workload/tasks.py @@ -53,7 +53,7 @@ class Task(object): Args: name (`str`): The name of the computation (typically the callback of the ERDOS operator. - task_graph_name (`str`): The name of the TaskGraph that this Task belongs to. + task_graph (`str`): The name of the TaskGraph that this Task belongs to. job (`Job`): The job that created this particular task. deadline (`EventTime`): The absolute deadline by which the task should complete. profile (`WorkProfile`): A profile of the computation that the Task is supposed @@ -224,6 +224,7 @@ def schedule( self._state = TaskState.SCHEDULED self._scheduling_time = time self._scheduler_placement = placement + self._start_time = placement.placement_time self._worker_pool_id = placement.worker_pool_id self.update_remaining_time(placement.execution_strategy.runtime)