diff --git a/analyze.py b/analyze.py
index a2219981..97933fbd 100644
--- a/analyze.py
+++ b/analyze.py
@@ -335,7 +335,7 @@ def analyze_resource_utilization(
     # Plotting defaults.
     # hatches = ['//', '--', '**']
     # alphas = np.arange(0.2, 1.2, 0.2)
-    resource_color = {"GPU": "red", "CPU": "green"}
+    resource_color = {"Slot": "green"}
 
     # Worker Pool statistics
     worker_pool_stats = csv_reader.get_worker_pool_utilizations(scheduler_csv_file)
@@ -1246,16 +1246,16 @@ def log_aggregate_stats(
                 / sum(stat.resource_utilizations[resource])
                 for stat in worker_pool_stats
             ]
-            for resource in ("GPU", "CPU")
+            for resource in ("Slot",)
         }
 
         scheduler_invocations = csv_reader.get_scheduler_invocations(csv_file)
         placed_tasks = [
-            scheduler_invocation.placed_tasks
+            scheduler_invocation.num_placed_tasks
             for scheduler_invocation in scheduler_invocations
         ]
         unplaced_tasks = [
-            scheduler_invocation.unplaced_tasks
+            scheduler_invocation.num_unplaced_tasks
             for scheduler_invocation in scheduler_invocations
         ]
 
@@ -1268,8 +1268,7 @@ def log_aggregate_stats(
                 placement_delay,
                 deadline_delay,
                 stat_function(e2e_response_time),
-                stat_function(resource_uses["GPU"]),
-                stat_function(resource_uses["CPU"]),
+                stat_function(resource_uses["Slot"]),
                 stat_function(placed_tasks),
                 stat_function(unplaced_tasks),
                 log_name,
@@ -1288,8 +1287,7 @@ def log_aggregate_stats(
                 "Placement",
                 "Deadline",
                 "JCT",
-                "GPU",
-                "CPU",
+                "Slot",
                 "Placed",
                 "Unplaced",
                 "Log",
diff --git a/configs/tpch_replay_dsched.conf b/configs/tpch_replay_dsched.conf
new file mode 100644
index 00000000..1b839546
--- /dev/null
+++ b/configs/tpch_replay_dsched.conf
@@ -0,0 +1,38 @@
+# Output configs.
+--log=./tpch_replay_dsched.log
+--log_level=debug
+--csv=./tpch_replay_dsched.csv
+
+# Task configs.
+--runtime_variance=0
+
+# Scheduler configs.
+
+# DSched
+--scheduler=TetriSched
+--scheduler_runtime=0
+--enforce_deadlines
+--retract_schedules
+--release_taskgraphs
+--drop_skipped_tasks
+--scheduler_time_discretization=1
+
+# Deadline variance
+--min_deadline_variance=10
+--max_deadline_variance=25
+
+# Execution mode configs.
+--execution_mode=replay
+--replay_trace=tpch
+
+# Release time config.
+--override_release_policy=gamma
+--override_gamma_coefficient=1
+--override_poisson_arrival_rate=1
+--override_num_invocation=10
+
+# TPCH flags
+--random_seed=1234
+--tpch_query_dag_spec=profiles/workload/tpch/queries.yaml
+--tpch_dataset_size=50
+--worker_profile_path=profiles/workers/tpch_cluster.yaml
diff --git a/configs/tpch_replay_edf.conf b/configs/tpch_replay_edf.conf
new file mode 100644
index 00000000..cf23650a
--- /dev/null
+++ b/configs/tpch_replay_edf.conf
@@ -0,0 +1,47 @@
+# Output configs.
+# --log=./tpch_replay_dsched.log
+# --log_level=debug
+# --csv=./tpch_replay_dsched.csv
+
+--log=./tpch_replay_edf.log
+--log_level=debug
+--csv=./tpch_replay_edf.csv
+
+# Task configs.
+--runtime_variance=0
+
+# Scheduler configs.
+
+# EDF
+--scheduler=EDF
+--scheduler_runtime=0
+--enforce_deadlines
+
+# DSched
+# --scheduler=TetriSched
+# --scheduler_runtime=0
+# --enforce_deadlines
+# --retract_schedules
+# --release_taskgraphs
+# --drop_skipped_tasks
+# --scheduler_time_discretization=1
+
+# Deadline variance
+--min_deadline_variance=10
+--max_deadline_variance=25
+
+# Execution mode configs.
+--execution_mode=replay
+--replay_trace=tpch
+
+# Release time config.
+--override_release_policy=gamma
+--override_gamma_coefficient=1
+--override_poisson_arrival_rate=1
+--override_num_invocation=10
+
+# TPCH flags
+--random_seed=1234
+--tpch_query_dag_spec=profiles/workload/tpch/queries.yaml
+--tpch_dataset_size=50
+--worker_profile_path=profiles/workers/tpch_cluster.yaml
diff --git a/data/__init__.py b/data/__init__.py
index ec2c2986..9db1ee5b 100644
--- a/data/__init__.py
+++ b/data/__init__.py
@@ -7,6 +7,7 @@
 from .task_loader_benchmark import TaskLoaderBenchmark
 from .task_loader_pylot import TaskLoaderPylot
 from .task_loader_synthetic import TaskLoaderSynthetic
+from .tpch_loader import TpchWorkloadLoader
 from .worker_loader import WorkerLoader
 from .worker_loader_benchmark import WorkerLoaderBenchmark
 from .workload_loader import WorkloadLoader
diff --git a/data/csv_reader.py b/data/csv_reader.py
index d4d0d1f4..b81e0767 100644
--- a/data/csv_reader.py
+++ b/data/csv_reader.py
@@ -63,6 +63,11 @@ def parse_events(self, readings: Mapping[str, Sequence[str]]):
                         )
                     elif reading[1] == "UPDATE_WORKLOAD":
                         simulator.total_tasks += int(reading[2])
+                    elif reading[1] == "LOG_STATS":
+                        assert (
+                            simulator is not None
+                        ), "No SIMULATOR_START found for a corresponding SIMULATOR_END."
+                        simulator.update_stats(reading)
                     elif reading[1] == "SIMULATOR_END":
                         assert (
                             simulator is not None
diff --git a/data/csv_types.py b/data/csv_types.py
index 390cd0a6..851299f8 100644
--- a/data/csv_types.py
+++ b/data/csv_types.py
@@ -385,6 +385,18 @@ def __init__(self, csv_path: str, start_time: int, total_tasks: int = 0):
         self.scheduler_invocations: list[Scheduler] = []
         self.task_graphs: dict[str, TaskGraph] = {}
 
+    def update_stats(self, csv_reading: str):
+        """Refreshes the task and task-graph counters from a LOG_STATS CSV row."""
+        event = csv_reading[1]
+        assert event == "LOG_STATS", f"The event {event} was not of type LOG_STATS."
+        self.finished_tasks = int(csv_reading[2])
+        self.dropped_tasks = int(csv_reading[3])
+        self.missed_deadlines = int(csv_reading[4])
+        self.finished_task_graphs = int(csv_reading[5])
+        self.dropped_taskgraphs = int(csv_reading[6])
+        self.missed_taskgraphs = int(csv_reading[7])
+        self.goodput_taskgraphs = self.finished_task_graphs - self.missed_taskgraphs
+
     def update_finish(self, csv_reading: str):
         """Updates the values of the Simulator based on the SIMULATOR_END event from
         CSV.
@@ -396,10 +408,10 @@ def update_finish(self, csv_reading: str):
             csv_reading[1] == "SIMULATOR_END"
         ), f"The event {csv_reading[1]} was not of type SIMULATOR_END."
         self.end_time = int(csv_reading[0])
-        self.finished_tasks = int(csv_reading[2])
-        self.dropped_tasks = int(csv_reading[3])
-        self.missed_deadlines = int(csv_reading[4])
-        self.finished_task_graphs = int(csv_reading[5])
-        self.dropped_taskgraphs = int(csv_reading[6])
-        self.missed_taskgraphs = int(csv_reading[7])
-        self.goodput_taskgraphs = self.finished_task_graphs - self.missed_taskgraphs
+        # self.finished_tasks = int(csv_reading[2])
+        # self.dropped_tasks = int(csv_reading[3])
+        # self.missed_deadlines = int(csv_reading[4])
+        # self.finished_task_graphs = int(csv_reading[5])
+        # self.dropped_taskgraphs = int(csv_reading[6])
+        # self.missed_taskgraphs = int(csv_reading[7])
+        # self.goodput_taskgraphs = self.finished_task_graphs - self.missed_taskgraphs
diff --git a/data/tpch_loader.py b/data/tpch_loader.py
new file mode 100644
index 00000000..57ea6816
--- /dev/null
+++ b/data/tpch_loader.py
@@ -0,0 +1,600 @@
+import os
+import math
+import json
+import sys
+import random
+
+from typing import Any, Dict, List, Optional, Callable, Tuple
+from pathlib import Path
+from enum import Enum
+
+import absl
+import numpy as np
+import yaml
+import networkx as nx
+
+from utils import EventTime, setup_logging
+from workload import (
+    Workload,
+    WorkProfile,
+    TaskGraph,
+    Job,
+    JobGraph,
+    ExecutionStrategy,
+    ExecutionStrategies,
+    Resource,
+    Resources,
+)
+
+from .base_workload_loader import BaseWorkloadLoader
+
+
+class TpchQueryDifficulty(Enum):  # TPC-H query numbers bucketed by difficulty.
+    easy = {1, 3, 4, 6, 12, 14, 17, 19, 22}
+    medium = {10, 11, 13, 15, 16, 18, 20}
+    hard = {2, 7, 8, 9, 21}  # NOTE(review): query 5 is in no bucket; confirm.
+
+
+class TpchLoader:
+    """Construct TPC-H task graph from a query profile
+
+    Args:
+        path (`str`): Path to a YAML file specifying the TPC-H query DAGs
+        flags (`absl.flags`): The flags used to initialize the app, if any
+
+    """
+
+    def __init__(
+        self,
+        path: Path,
+        flags: "absl.flags",
+    ):
+        self._flags = flags
+        self._logger = setup_logging(
+            name=self.__class__.__name__,
+            log_dir=flags.log_dir,
+            log_file=flags.log_file_name,
+            log_level=flags.log_level,
+        )
+
+        # Load the spec and key each query's DAG by its number, i.e. the query
+        # name with its one-character prefix dropped.
+        with open(path, "r") as spec_file:
+            spec = yaml.safe_load(spec_file)
+        self._graphs = {
+            int(entry["name"][1:]): entry["graph"] for entry in spec["graphs"]
+        }
+
+    def make_job_graph(
+        self,
+        id: str,
+        query_num: int,
+        dependencies: Optional[List[Dict[str, Any]]] = None,
+        profile_type: Optional[str] = None,
+        dataset_size: Optional[int] = None,
+        max_executors_per_job: Optional[int] = None,
+        min_task_runtime: Optional[int] = None,
+        runtime_unit: EventTime.Unit = EventTime.Unit.US,
+    ) -> Tuple[JobGraph, Optional[Dict[Any, Any]]]:
+        if profile_type is None:
+            profile_type = self._flags.tpch_profile_type
+        if dataset_size is None:
+            dataset_size = self._flags.tpch_dataset_size
+        if max_executors_per_job is None:
+            max_executors_per_job = self._flags.tpch_max_executors_per_job
+        if min_task_runtime is None:
+            min_task_runtime = self._flags.tpch_min_task_runtime
+
+        # Normalize dependencies. NOTE: caller-supplied nodes are renamed in place.
+        if dependencies is None:
+            dependencies = self._graphs[query_num]
+            deps_mapping = None  # the canonical DAG is used as-is; nothing to map
+        else:
+            deps_mapping = self.__map_dependencies(query_num, dependencies)
+            for node in dependencies:
+                node["name"] = deps_mapping[node["name"]]
+                if "children" in node:
+                    node["children"] = [deps_mapping[c] for c in node["children"]]
+            self._logger.info(
+                f"Mapped dependencies for TPC-H query {query_name(query_num)} as {deps_mapping}."
+            )
+
+        # Construct a JobGraph: one Job per node, then the parent -> child edges.
+        job_graph = JobGraph(name=task_graph_name(query_num, id))
+        profiler_data = get_all_stage_info_for_query(
+            query_num,
+            profile_type,
+            dataset_size,
+            max_executors_per_job,
+        )
+        name_to_job = {}
+        for node in dependencies:
+            worker_profile = self.__make_work_profile(
+                profiler_data=profiler_data,
+                query_num=query_num,
+                node_name=node["name"],
+                max_executors_per_job=max_executors_per_job,
+                min_task_runtime=min_task_runtime,
+                runtime_unit=runtime_unit,
+            )
+            job = Job(
+                name=node["name"],
+                profile=worker_profile,
+            )
+            name_to_job[node["name"]] = job
+            job_graph.add_job(job=job)
+        for node in dependencies:
+            job = name_to_job[node["name"]]
+            if "children" in node:
+                for child in node["children"]:
+                    if child not in name_to_job:
+                        raise ValueError(
+                            f"Child {child} of {node['name']} was "
+                            f"not present in the graph."
+                        )
+                    child_job = name_to_job[child]
+                    job_graph.add_child(job, child_job)
+
+        self._logger.info(
+            f"Constructed JobGraph for TPC-H query {query_name(query_num)}."
+        )
+
+        return job_graph, deps_mapping
+
+    def __make_work_profile(
+        self,
+        profiler_data: Dict[int, Dict[str, Any]],
+        query_num: int,
+        node_name: str,
+        max_executors_per_job: int,
+        min_task_runtime: int,
+        runtime_unit: EventTime.Unit,
+    ) -> WorkProfile:
+        profile = profiler_data[int(node_name)]
+
+        profiled_task_slots = profile["num_tasks"]
+        profiled_runtime = math.ceil(profile["avg_task_duration_ms"] / 1e3)  # ms -> s
+
+        if profiled_task_slots > max_executors_per_job:  # cap slots, stretch runtime
+            num_slots = max_executors_per_job
+            runtime = math.ceil(
+                (profiled_task_slots * profiled_runtime) / max_executors_per_job
+            )
+            self._logger.debug(
+                "%s@%s: num_slots (%s) > max_executors_per_job (%s). Converted "
+                "(slots,runtime) from (%s,%s) to (%s, %s)",
+                node_name,
+                query_name(query_num),
+                profiled_task_slots,
+                max_executors_per_job,
+                profiled_task_slots,
+                profiled_runtime,
+                num_slots,
+                runtime,
+            )
+        else:
+            num_slots = profiled_task_slots
+            runtime = profiled_runtime
+
+        if runtime < min_task_runtime:  # clamp up to the configured floor
+            _runtime = runtime
+            runtime = max(min_task_runtime, _runtime)
+            self._logger.debug(
+                "%s@%s: runtime (%s) < min_task_runtime (%s). Converted "
+                "(slots,runtime) from (%s,%s) to (%s, %s)",
+                node_name,
+                query_name(query_num),
+                _runtime,
+                min_task_runtime,
+                num_slots,
+                _runtime,
+                num_slots,
+                runtime,
+            )
+
+        resources = Resources(
+            resource_vector={
+                Resource(name="Slot", _id="any"): num_slots,
+            },
+        )
+        execution_strategies = ExecutionStrategies()
+        execution_strategies.add_strategy(
+            strategy=ExecutionStrategy(
+                resources=resources,
+                batch_size=1,
+                runtime=EventTime(runtime, runtime_unit),
+            ),
+        )
+        return WorkProfile(
+            name=f"{query_name(query_num)}_{node_name}_execution_profile",
+            execution_strategies=execution_strategies,
+        )
+
+    def __map_dependencies(self, query_num: int, deps: List[Dict[str, Any]]):
+        """Maps node names in `deps` onto the canonical DAG of `query_num`.
+
+        Returns a dict from provided node names to canonical node names and
+        raises ValueError if the two DAGs are not isomorphic.
+        """
+
+        def deps_to_nx_graph(deps: List[Dict[str, Any]]):
+            edges = [
+                (node["name"], child)
+                for node in deps
+                for child in node.get("children", [])
+            ]
+            nx_deps = nx.DiGraph(edges)
+            # Stages without any edge (e.g. a single-stage query) are absent from
+            # the edge list, so register every node explicitly.
+            nx_deps.add_nodes_from(node["name"] for node in deps)
+            return nx_deps
+
+        def are_structurally_same(graph1, graph2):
+            # Step 1: Check if both graphs have the same number of vertices
+            if len(graph1.nodes) != len(graph2.nodes):
+                return False, None
+
+            # Step 2: Check if there exists a bijection between the vertices
+            #         of the two graphs such that their adjacency relationships match
+            # NOTE(review): networkx documents DiGraphMatcher for directed graphs;
+            # confirm that GraphMatcher is intended here.
+            for mapping in nx.isomorphism.GraphMatcher(
+                graph1, graph2
+            ).isomorphisms_iter():
+                # Check if the adjacency relationships match
+                if all(v in mapping for u, v in graph1.edges):
+                    # graph structures match
+                    # mapping is a dict {key=original-stage-id, val=app-stage-id}
+                    # we return the reversed mapping: app-stage-id -> orig-stage-id
+                    reversed_mapping = {v: k for k, v in mapping.items()}
+                    return True, reversed_mapping
+
+            return False, None
+
+        base_deps = self._graphs[query_num]
+        is_same, mapping = are_structurally_same(
+            deps_to_nx_graph(base_deps), deps_to_nx_graph(deps)
+        )
+
+        if not is_same:
+            raise ValueError(
+                f"Structure of dependencies provided for query number {query_num} does not match that of canonical dependencies. Provided: {deps}. Canonical: {base_deps}"
+            )
+
+        return mapping
+
+    @property
+    def num_queries(self) -> int:
+        return len(self._graphs)  # one entry per query parsed from the DAG spec
+
+
+class TpchWorkloadLoader(BaseWorkloadLoader):
+    """Construct a TPC-H query workload
+
+    Args:
+        flags (`absl.flags`): The flags used to initialize the app, if any
+    """
+
+    def __init__(self, flags: "absl.flags") -> None:
+        self._flags = flags
+        self._logger = setup_logging(
+            name=self.__class__.__name__,
+            log_dir=flags.log_dir,
+            log_file=flags.log_file_name,
+            log_level=flags.log_level,
+        )
+        self._rng_seed = flags.random_seed
+        self._rng = random.Random(self._rng_seed)
+        if flags.workload_update_interval > 0:
+            self._workload_update_interval = flags.workload_update_interval
+        else:  # non-positive flag: use the largest representable interval
+            self._workload_update_interval = EventTime(sys.maxsize, EventTime.Unit.US)
+
+        # Instantiate the TPC-H loader that turns query numbers into job graphs
+        self._tpch_loader = TpchLoader(path=flags.tpch_query_dag_spec, flags=flags)
+
+        # Initialize [(query_num, release_time)]
+        self._query_nums_and_release_times = []
+        if len(flags.override_num_invocations) > 0:
+            # One entry per difficulty bucket, ordered easy, medium, hard
+            assert len(flags.override_num_invocations) == len(TpchQueryDifficulty)
+            assert len(flags.override_poisson_arrival_rates) == len(
+                flags.override_num_invocations
+            )
+
+            # Per-bucket overrides are only supported with the poisson release policy
+            assert flags.override_release_policy == "poisson"
+
+            for i, part in enumerate(TpchQueryDifficulty):
+                release_policy = self.__make_release_policy(
+                    policy_type=flags.override_release_policy,
+                    arrival_rate=float(flags.override_poisson_arrival_rates[i]),
+                    num_invocations=int(flags.override_num_invocations[i]),
+                )
+                release_times = release_policy.get_release_times(
+                    completion_time=EventTime(
+                        self._flags.loop_timeout, EventTime.Unit.US
+                    )
+                )
+                query_nums = [
+                    self._rng.choice(list(part.value))
+                    for _ in range(int(flags.override_num_invocations[i]))
+                ]
+                self._query_nums_and_release_times.extend(
+                    list(zip(query_nums, release_times))
+                )
+
+            self._query_nums_and_release_times.sort(key=lambda x: x[1])
+        else:  # single stream; query numbers are drawn uniformly from 1..num_queries
+            release_policy = self.__make_release_policy()
+            release_times = release_policy.get_release_times(
+                completion_time=EventTime(self._flags.loop_timeout, EventTime.Unit.US)
+            )
+            query_nums = [
+                self._rng.randint(1, self._tpch_loader.num_queries)
+                for _ in range(self._flags.override_num_invocation)
+            ]
+            self._query_nums_and_release_times.extend(
+                list(zip(query_nums, release_times))
+            )
+
+        self._current_release_pointer = 0
+
+        # Initialize workload
+        self._workload = Workload.empty(flags)
+
+    def __make_release_policy(
+        self, policy_type=None, arrival_rate=None, num_invocations=None
+    ):
+        """Builds a release policy; arguments left as None fall back to flags."""
+        if policy_type is None:
+            policy_type = self._flags.override_release_policy
+        if arrival_rate is None:
+            arrival_rate = self._flags.override_poisson_arrival_rate
+        if num_invocations is None:
+            num_invocations = self._flags.override_num_invocation
+
+        if policy_type == "periodic":
+            policy_kwargs = {
+                "period": EventTime(
+                    self._flags.override_arrival_period, EventTime.Unit.US
+                ),
+            }
+        elif policy_type == "fixed":
+            policy_kwargs = {
+                "period": EventTime(
+                    self._flags.override_arrival_period, EventTime.Unit.US
+                ),
+                "num_invocations": num_invocations,
+            }
+        elif policy_type == "poisson":
+            policy_kwargs = {
+                "rate": arrival_rate,
+                "num_invocations": num_invocations,
+            }
+        elif policy_type == "gamma":
+            policy_kwargs = {
+                "rate": arrival_rate,
+                "num_invocations": num_invocations,
+                "coefficient": self._flags.override_gamma_coefficient,
+            }
+        elif policy_type == "fixed_gamma":
+            policy_kwargs = {
+                "variable_arrival_rate": arrival_rate,
+                "base_arrival_rate": self._flags.override_base_arrival_rate,
+                "num_invocations": num_invocations,
+                "coefficient": self._flags.override_gamma_coefficient,
+            }
+        else:
+            raise NotImplementedError(f"Release policy {policy_type} not implemented.")
+
+        return make_release_policy(
+            policy_type,
+            policy_kwargs,
+            self._rng,
+            self._rng_seed,
+            (
+                self._flags.randomize_start_time_min,
+                self._flags.randomize_start_time_max,
+            ),
+        )
+
+    def get_next_workload(self, current_time: EventTime) -> Optional[Workload]:
+        """Releases the queries due within the next workload update interval.
+
+        Returns:
+            The accumulated workload, or None once nothing is left to release.
+        """
+        # Reset rng if this is the first workload. This is to ensure we have
+        # parity with how jobs are spawned in Spark
+        if self._current_release_pointer == 0:
+            self._rng = random.Random(self._rng_seed)
+
+        pending = self._query_nums_and_release_times
+        first_index = self._current_release_pointer
+        while (
+            self._current_release_pointer < len(pending)
+            and pending[self._current_release_pointer][1]
+            <= current_time + self._workload_update_interval
+        ):
+            self._current_release_pointer += 1
+        to_release = pending[first_index : self._current_release_pointer]
+
+        if self._current_release_pointer >= len(pending) and not to_release:
+            # Nothing left to release
+            return None
+
+        # Number graphs by their global release index: a per-call index restarts at
+        # 0 and would give two batches releasing the same query identical names.
+        for i, (q, t) in enumerate(to_release, start=first_index):
+            job_graph, _ = self._tpch_loader.make_job_graph(id=str(i), query_num=q)
+            task_graph = job_graph.get_next_task_graph(
+                start_time=t,
+                _flags=self._flags,
+            )
+            self._workload.add_task_graph(task_graph)
+
+        return self._workload
+
+
+def query_name(query_num: int) -> str:
+    return "Q" + str(query_num)
+
+
+def task_graph_name(query_num: int, id: Any) -> str:  # e.g. (5, "0") -> "Q5[0]"
+    return f"{query_name(query_num)}[{id}]"
+
+
+def make_release_policy(
+    release_policy, release_policy_args, rng, seed, randomize_start_time=(0, 0)
+):
+    """Creates a policy via `JobGraph.ReleasePolicy.<release_policy>(...)`."""
+    # Every policy argument must already be resolved to a concrete value.
+    assert all(value is not None for value in release_policy_args.values())
+
+    # The policy's start time is drawn with `rng.randint` from the
+    # `randomize_start_time` bounds.
+    start_time = EventTime(
+        time=rng.randint(*randomize_start_time),
+        unit=EventTime.Unit.US,
+    )
+
+    policy_factory = getattr(JobGraph.ReleasePolicy, release_policy)
+    return policy_factory(start=start_time, rng_seed=seed, **release_policy_args)
+
+
+# TODO: make configurable
+TPCH_SUBDIR = "100g/"
+DECIMA_TPCH_DIR = (
+    Path(__file__).resolve().parent / ".." / "profiles/workload/tpch/decima/"
+)
+CLOUDLAB_TPCH_DIR = (
+    Path(__file__).resolve().parent / ".." / "profiles/workload/tpch/cloudlab/"
+)
+
+
+class SetWithCount(object):
+    """A multiset: remembers how many times each item has been added."""
+
+    def __init__(self):
+        # Maps item -> number of occurrences, as maintained by add() / remove().
+        self.set = {}
+
+    def __contains__(self, item):
+        return item in self.set
+
+    def add(self, item):
+        """Records one more occurrence of `item`."""
+        self.set[item] = self.set.get(item, 0) + 1
+
+    def clear(self):
+        self.set.clear()
+
+    def remove(self, item):
+        """Drops one occurrence of `item`; raises KeyError if it is absent."""
+        remaining = self.set[item] - 1
+        if remaining:
+            self.set[item] = remaining
+        else:
+            del self.set[item]
+
+
+def pre_process_task_duration(task_duration):
+    # remove fresh durations from first wave
+    clean_first_wave = {}  # NOTE(review): never stored or returned (no-op)
+    for e in task_duration["first_wave"]:
+        clean_first_wave[e] = []
+        fresh_durations = SetWithCount()
+        # O(1) access
+        for d in task_duration["fresh_durations"][e]:
+            fresh_durations.add(d)
+        for d in task_duration["first_wave"][e]:
+            if d not in fresh_durations:
+                clean_first_wave[e].append(d)
+            else:
+                # prevent duplicated fresh duration blocking first wave
+                fresh_durations.remove(d)
+
+
+def get_all_stage_info_for_query(query_num, profile_type, dataset_size, max_executors):
+    """Loads the per-stage profile of a query from the requested profile source.
+
+    Raises ValueError for any `profile_type` other than "Decima" or "Cloudlab".
+    """
+    if profile_type == "Decima":
+        return use_decima_tpch_profile(query_num, dataset_size)
+    if profile_type == "Cloudlab":
+        return use_cloudlab_profile(query_num, dataset_size, max_executors)
+    raise ValueError(f"Invalid profile type: {profile_type}")
+
+
+def use_cloudlab_profile(query_num, dataset_size, max_executors):
+    """Reads the Cloudlab profile of one TPC-H query.
+
+    Args:
+        query_num: TPC-H query number.
+        dataset_size: Dataset size; appears as `<size>g` in the profile key.
+        max_executors: Executor cap; appears as `maxCores_<n>` in the profile key.
+
+    Returns:
+        A dict from stage index to a dict holding `stage_id`, `num_tasks` and
+        `avg_task_duration_ms` (the profiled average runtime, rounded).
+    """
+    profile_path = os.path.join(
+        CLOUDLAB_TPCH_DIR, "cloudlab_22query_tpch_profiles.json"
+    )
+    with open(profile_path, "r") as profile_file:
+        all_profiles = json.load(profile_file)
+
+    profile_key = f"tpch_q{query_num}_{dataset_size}g_maxCores_{max_executors}"
+    stage_profiles = all_profiles[profile_key]
+
+    return {
+        stage_id: {
+            "stage_id": stage_id,
+            "num_tasks": stage_profile["num_tasks"],
+            "avg_task_duration_ms": round(stage_profile["average_runtime_ms"]),
+        }
+        for stage_id, stage_profile in enumerate(stage_profiles)
+    }
+
+
+def use_decima_tpch_profile(query_num, dataset_size):
+    """Builds per-stage task counts and mean task durations from Decima profiles."""
+    # An int size (e.g. 50) used to make os.path.join raise TypeError; map it to
+    # "<size>g" (naming inferred from TPCH_SUBDIR -- NOTE(review): confirm).
+    if isinstance(dataset_size, int):
+        dataset_size = f"{dataset_size}g"
+    # NOTE(review): allow_pickle=True unpickles the file; load trusted profiles only.
+    task_durations = np.load(
+        os.path.join(DECIMA_TPCH_DIR, dataset_size, f"task_duration_{query_num}.npy"),
+        allow_pickle=True,
+    ).item()
+
+    stage_info = {}
+    for n in range(len(task_durations)):
+        task_duration = task_durations[n]
+        e = next(iter(task_duration["first_wave"]))
+        # NOTE: somehow only picks the first element {2: [n_tasks_in_ms]}
+
+        num_tasks = len(task_duration["first_wave"][e]) + len(
+            task_duration["rest_wave"][e]
+        )
+
+        # remove fresh duration from first wave duration
+        # drag nearest neighbor first wave duration to empty spots
+        pre_process_task_duration(task_duration)
+        rough_duration = np.mean(
+            [i for t in task_duration["first_wave"].values() for i in t]
+            + [i for t in task_duration["rest_wave"].values() for i in t]
+            + [i for t in task_duration["fresh_durations"].values() for i in t]
+        )
+
+        # NOTE: Runtime per task is given in milliseconds
+        stage_info[n] = {
+            "stage_id": n,
+            "num_tasks": num_tasks,
+            "avg_task_duration_ms": round(rough_duration),
+        }
+
+    return stage_info
diff --git a/data/tpch_partitioning_analysis.ipynb b/data/tpch_partitioning_analysis.ipynb
new file mode 100644
index 00000000..6b0eb160
--- /dev/null
+++ b/data/tpch_partitioning_analysis.ipynb
@@ -0,0 +1,315 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import math"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_json(file_path):\n",
+    "    \"\"\"Load JSON data from a file.\"\"\"\n",
+    "    with open(file_path, \"r\") as f:\n",
+    "        return json.load(f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def recompute_tasks_runtime(stage_id, num_tasks, avg_runtime_ms, max_executors, min_task_runtime_ms):\n",
+    "    \"\"\"Recompute the number of tasks and runtime per task while enforcing constraints.\"\"\"\n",
+    "    profiled_runtime = math.ceil(avg_runtime_ms / 1000)  # Convert ms to seconds\n",
+    "\n",
+    "    if num_tasks > max_executors:\n",
+    "        adjusted_runtime = math.ceil((num_tasks * profiled_runtime) / max_executors)\n",
+    "        adjusted_num_tasks = max_executors\n",
+    "    else:\n",
+    "        adjusted_runtime = profiled_runtime\n",
+    "        adjusted_num_tasks = num_tasks\n",
+    "\n",
+    "    final_runtime = max(math.ceil(min_task_runtime_ms / 1000), adjusted_runtime)  # Enforce min runtime\n",
+    "\n",
+    "    # print(\n",
+    "    #     f\"Stage {stage_id}: num_tasks ({num_tasks}) -> {adjusted_num_tasks}, \"\n",
+    "    #     f\"runtime_s ({profiled_runtime}) -> {final_runtime}\"\n",
+    "    # )\n",
+    "\n",
+    "    return adjusted_num_tasks, final_runtime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_tpch_data(json_data, dataset_size, max_executors, min_task_runtime_ms):\n",
+    "    \"\"\"Extract TPCH query data, filter relevant queries, and recompute runtime.\"\"\"\n",
+    "    extracted_data = {}\n",
+    "\n",
+    "    for query_key, stages in json_data.items():\n",
+    "        if dataset_size in query_key and \"maxCores_\" + str(max_executors) in query_key:\n",
+    "            query_id = query_key.split(\"_\")[1]  # Extract query number (e.g., 'q1')\n",
+    "            # print(f\"--------Processing query {query_id}\")\n",
+    "            extracted_data[query_id] = [\n",
+    "                (stage[\"stage_id\"], recompute_tasks_runtime(stage[\"stage_id\"], stage[\"num_tasks\"], int(stage[\"average_runtime_ms\"]), max_executors, min_task_runtime_ms))\n",
+    "                for stage in stages\n",
+    "            ]\n",
+    "\n",
+    "    return extracted_data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compute_resource_space(data):\n",
+    "    \"\"\"Compute the total resource space required for each query.\"\"\"\n",
+    "    return {query_id: sum(num_tasks * runtime for _, (num_tasks, runtime) in stages) for query_id, stages in data.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def bucketize_queries(resource_space, bucket_size):\n",
+    "    \"\"\"Classify queries into easy, medium, and hard buckets based on resource consumption.\"\"\"\n",
+    "    buckets = {\"easy\": [], \"medium\": [], \"hard\": []}\n",
+    "    for query_id, value in resource_space.items():\n",
+    "        if value < bucket_size:\n",
+    "            buckets[\"easy\"].append(query_id)\n",
+    "        elif bucket_size <= value <= 2 * bucket_size:\n",
+    "            buckets[\"medium\"].append(query_id)\n",
+    "        else:\n",
+    "            buckets[\"hard\"].append(query_id)\n",
+    "    return buckets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def analyze_tpch_queries(json_path, bucket_size, dataset_size, max_executors, min_task_runtime_ms):\n",
+    "    \"\"\"Main function to process TPCH queries and return categorized buckets.\"\"\"\n",
+    "    json_data = load_json(json_path)\n",
+    "    extracted_data = extract_tpch_data(json_data, dataset_size, max_executors, min_task_runtime_ms)\n",
+    "    resource_requirements = compute_resource_space(extracted_data)\n",
+    "    return bucketize_queries(resource_requirements, bucket_size)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Input params to create buckets\n",
+    "json_path = \"/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/cloudlab/cloudlab_22query_tpch_profiles.json\"\n",
+    "bucket_size=8000\n",
+    "dataset_size=\"100g\"\n",
+    "max_executors=200\n",
+    "min_task_runtime_ms=12000\n",
+    "\n",
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=bucket_size, dataset_size=dataset_size, max_executors=max_executors, min_task_runtime_ms=min_task_runtime_ms)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 100g dataset, varying the executors: 75, 100, 200"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'easy': ['q11', 'q13', 'q14', 'q15', 'q19', 'q20', 'q22'],\n",
+       " 'medium': ['q1', 'q2', 'q4', 'q6', 'q10', 'q12', 'q16', 'q17', 'q18'],\n",
+       " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q21']}"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=5500, dataset_size=\"100g\", max_executors=75, min_task_runtime_ms=12000)\n",
+    "buckets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n",
+       " 'medium': ['q1', 'q4', 'q6', 'q10', 'q12', 'q14', 'q15', 'q17', 'q20'],\n",
+       " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q18', 'q21']}"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=8000, dataset_size=\"100g\", max_executors=100, min_task_runtime_ms=12000)\n",
+    "buckets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'easy': ['q6', 'q11', 'q13', 'q19', 'q22'],\n",
+       " 'medium': ['q1', 'q2', 'q4', 'q10', 'q12', 'q14', 'q15', 'q16', 'q20'],\n",
+       " 'hard': ['q3', 'q5', 'q7', 'q8', 'q9', 'q17', 'q18', 'q21']}"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=10000, dataset_size=\"100g\", max_executors=200, min_task_runtime_ms=12000)\n",
+    "buckets"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 250g dataset, varying the executors: 75, 100, 200"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n",
+       " 'medium': ['q1', 'q6', 'q7', 'q10', 'q12', 'q14', 'q15', 'q20'],\n",
+       " 'hard': ['q3', 'q4', 'q5', 'q8', 'q9', 'q17', 'q18', 'q21']}"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=11500, dataset_size=\"250g\", max_executors=75, min_task_runtime_ms=12000)\n",
+    "buckets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'easy': ['q2', 'q11', 'q13', 'q16', 'q19', 'q22'],\n",
+       " 'medium': ['q1', 'q6', 'q10', 'q12', 'q14', 'q15', 'q20'],\n",
+       " 'hard': ['q3', 'q4', 'q5', 'q7', 'q8', 'q9', 'q17', 'q18', 'q21']}"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=15000, dataset_size=\"250g\", max_executors=100, min_task_runtime_ms=12000)\n",
+    "buckets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'easy': ['q1', 'q2', 'q6', 'q11', 'q13', 'q16', 'q22'],\n",
+       " 'medium': ['q4', 'q7', 'q10', 'q12', 'q14', 'q15', 'q19', 'q20'],\n",
+       " 'hard': ['q3', 'q5', 'q8', 'q9', 'q17', 'q18', 'q21']}"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "buckets = analyze_tpch_queries(json_path=json_path, bucket_size=15000, dataset_size=\"250g\", max_executors=200, min_task_runtime_ms=12000)\n",
+    "buckets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "dg_erdos",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/main.py b/main.py
index b2df225a..b77e0b6d 100644
--- a/main.py
+++ b/main.py
@@ -9,6 +9,7 @@
     TaskLoaderBenchmark,
     TaskLoaderPylot,
     TaskLoaderSynthetic,
+    TpchWorkloadLoader,
     WorkerLoader,
     WorkerLoaderBenchmark,
     WorkloadLoader,
@@ -34,7 +35,7 @@
 flags.DEFINE_enum(
     "replay_trace",
     "pylot",
-    ["pylot", "clockwork_bursty", "alibaba"],
+    ["pylot", "clockwork_bursty", "alibaba", "tpch"],
     "Sets the trace to replay in the replay mode.",
 )
 flags.DEFINE_string(
@@ -115,6 +116,17 @@
     "If set to default (-1), then the Simulator will automatically choose an interval "
     "based on the set of released tasks in the previous iteration.",
 )
+flags.DEFINE_bool(
+    "orchestrated",
+    False,
+    "Runs the simulator in orchestrated mode. Currently used by the ERDOS service.",
+)
+flags.DEFINE_integer(
+    "min_placement_push_duration",
+    1,
+    "The duration (in µs) by which to push a task placement if it cannot be "
+    "placed on a worker at its original time",
+)
 
 # Benchmark related flags.
 flags.DEFINE_integer(
@@ -130,6 +142,40 @@
     "benchmark_num_cpus", 10, "Number of CPUs available for benchmarking."
 )
 
+# TPCH related flags
+flags.DEFINE_string(
+    "tpch_query_dag_spec",
+    "./profiles/workload/tpch/queries.yaml",
+    "Path to a YAML file specifying the TPC-H query DAGs",
+)
+flags.DEFINE_integer(
+    "tpch_num_queries",
+    50,
+    "Number of TPC-H queries to run",
+)
+flags.DEFINE_enum(
+    "tpch_profile_type",
+    "Cloudlab",
+    ["Cloudlab", "Decima"],
+    "Type of TPC-H profile the data loader must use",
+)
+flags.DEFINE_enum(
+    "tpch_dataset_size",
+    "50",
+    ["2", "50", "100", "250", "500"],
+    "Size of the TPC-H dataset to use",
+)
+flags.DEFINE_integer(
+    "tpch_max_executors_per_job",
+    50,
+    "Maximum number of executors to use per TPC-H query stage",
+)
+flags.DEFINE_integer(
+    "tpch_min_task_runtime",
+    8,
+    "Minimum runtime of a TPC-H task",
+)
+
 # AlibabaLoader related flags.
 flags.DEFINE_integer(
     "alibaba_loader_task_cpu_multiplier",
@@ -345,8 +391,9 @@
 flags.DEFINE_integer(
     "scheduler_max_time_discretization",
     5,
-    "The maximum discretization that the scheduler can have (in µs). "
-    "Only used when scheduler_adaptive_discretization flag is enabled. (default: 5)",
+    "The maximum discretization that the scheduler can have. "
+    "Only used when scheduler_adaptive_discretization flag is enabled. (default: 5). "
+    "Be careful about the EventTime.Unit. Some parts of the code assume Unit.US",
 )
 flags.DEFINE_float(
     "scheduler_max_occupancy_threshold",
@@ -385,9 +432,10 @@
     "scheduler_time_discretization",
     1,
     "The length of each slot in the space-time matrix to consider for scheduling the "
-    "tasks (in µs). The default value is 1µs, and a higher value can lead to faster "
+    "tasks. The default value is 1 (see note for unit), and a higher value can lead to faster "
     "solutions but a potentially lower goodput due to resources being blocked for the "
-    "entirety of the slot.",
+    "entirety of the slot. NOTE: Since time in the simulator is an abstract concept, be "
+    "careful about the EventTime.Unit. Some parts of the code might assume Unit.US",
 )
 flags.DEFINE_enum(
     "scheduler_policy",
@@ -473,7 +521,7 @@
     "placing the TaskGraph, and drop the TaskGraph if it cannot be placed after.",
 )
 flags.DEFINE_multi_enum(
-    "optimization_passes",
+    "opt_passes",
     [],
     [
         "CRITICAL_PATH_PASS",
@@ -633,6 +681,8 @@ def main(args):
                 ),
                 flags=FLAGS,
             )
+        elif FLAGS.replay_trace == "tpch":
+            workload_loader = TpchWorkloadLoader(flags=FLAGS)
         else:
             raise NotImplementedError(
                 f"Replay trace {FLAGS.replay_trace} is not implemented yet."
diff --git a/profiles/workers/tpch_cluster.yaml b/profiles/workers/tpch_cluster.yaml
new file mode 100644
index 00000000..582302b2
--- /dev/null
+++ b/profiles/workers/tpch_cluster.yaml
@@ -0,0 +1,6 @@
+- name: WorkerPool_1
+  workers:
+      - name: Worker_1_1
+        resources:
+            - name: Slot
+              quantity: 640
diff --git a/requirements.txt b/requirements.txt
index f3e8957c..4be1c543 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,3 +14,4 @@ cplex
 pre-commit
 black
 isort
+networkx
diff --git a/rpc/README.md b/rpc/README.md
index 294e2287..dcb13dbe 100644
--- a/rpc/README.md
+++ b/rpc/README.md
@@ -5,26 +5,26 @@ The package provides support for connecting frameworks to the ERDOS Simulator, w
 
 This code is being tested with Apache Spark v3.5.0 (with additional instrumentation outlined in [this](https://github.com/dhruvsgarg/spark_mirror/tree/erdos-spark-integration) repository)
 
-To get the RPC service setup, first install the required packages using:
+To set up the RPC service, install the required packages from the ERDOS root directory using:
 
 ```bash
-pip install -r requirements.txt
+pip install -r rpc/requirements.txt
 ```
 
 Then, run protoc to generate the service and message definitions using:
 
 ```bash
-python -m grpc_tools.protoc -I./protos --python_out=./ --grpc_python_out=./ ./protos/erdos_scheduler.proto
+python -m grpc_tools.protoc -I./rpc/protos --python_out=. --grpc_python_out=. ./rpc/protos/rpc/erdos_scheduler.proto
 ```
 
 and run the service using:
 
 ```bash
-python service.py
+python -m rpc.service
 ```
 
 You can also find the supported flags by the service, by running
 
 ```bash
-python service.py --help
+python -m rpc.service --help
 ```
diff --git a/rpc/launch_tpch_queries.py b/rpc/launch_tpch_queries.py
new file mode 100644
index 00000000..2d0f91bf
--- /dev/null
+++ b/rpc/launch_tpch_queries.py
@@ -0,0 +1,256 @@
+import argparse
+import os
+import random
+import subprocess
+import sys
+import time
+import numpy as np
+
+from pathlib import Path
+
+from workload import JobGraph
+from utils import EventTime
+from data.tpch_loader import make_release_policy
+from rpc import erdos_scheduler_pb2
+from rpc import erdos_scheduler_pb2_grpc
+
+import grpc
+
+
+def map_dataset_to_deadline(dataset_size):
+    """Return the deadline in seconds for a dataset size; unknown sizes get 120."""
+    deadlines_s = {"50": 2 * 60, "100": 6 * 60, "250": 12 * 60, "500": 24 * 60}
+    return deadlines_s[dataset_size] if dataset_size in deadlines_s else 120
+
+
+def launch_query(query_number, index, args):
+    """Submit one TPC-H query to Spark in cluster mode via ``spark-submit``.
+
+    Returns the ``subprocess.Popen`` handle, or ``None`` when the process could
+    not be started (the error is printed rather than raised).
+    """
+    import shlex  # local import: only used to quote the command line below
+
+    deadline = map_dataset_to_deadline(args.dataset_size)
+    argv = [
+        f"{args.spark_mirror_path.resolve()}/bin/spark-submit",
+        *("--deploy-mode", "cluster"),
+        *("--master", f"spark://{args.spark_master_ip}:7077"),
+        *("--conf", "spark.port.maxRetries=132"),
+        *("--conf", "spark.eventLog.enabled=true"),
+        *("--conf", f"spark.eventLog.dir={args.spark_eventlog_dir.resolve()}"),
+        *("--conf", "spark.sql.adaptive.enabled=false"),
+        *("--conf", "spark.sql.adaptive.coalescePartitions.enabled=false"),
+        *("--conf", "spark.sql.autoBroadcastJoinThreshold=-1"),
+        *("--conf", "spark.sql.shuffle.partitions=1"),
+        *("--conf", "spark.sql.files.minPartitionNum=1"),
+        *("--conf", "spark.sql.files.maxPartitionNum=1"),
+        *("--conf", f"spark.app.deadline={deadline}"),
+        *("--class", "main.scala.TpchQuery"),
+        f"{args.tpch_spark_path.resolve()}/target/scala-2.13/spark-tpc-h-queries_2.13-1.0.jar",
+        str(query_number),
+        str(index),
+        str(args.dataset_size),
+        str(args.max_cores),
+    ]
+    # shlex.join quotes every argument, so paths containing spaces or quote
+    # characters reach spark-submit intact instead of being split by the shell.
+    cmd = shlex.join(argv)
+    print("Launching:", cmd)
+    try:
+        p = subprocess.Popen(cmd, shell=True)
+    except Exception as e:
+        print(f"Error launching query: {e}")
+        return None
+    print("Query launched successfully.")
+    return p
+
+
+def generate_release_times(rng, args):
+    """Compute query release times for the arrival process in ``args.distribution``.
+
+    Raises ``NotImplementedError`` for any distribution without a policy below.
+    """
+    # Arguments are built lazily so only the selected policy reads its options.
+    policy_args_for = {
+        "periodic": lambda: {
+            "period": EventTime(args.period, EventTime.Unit.US),
+        },
+        "fixed": lambda: {
+            "period": EventTime(args.period, EventTime.Unit.US),
+            "num_invocations": args.num_queries,
+        },
+        "poisson": lambda: {
+            "rate": args.variable_arrival_rate,
+            "num_invocations": args.num_queries,
+        },
+        "gamma": lambda: {
+            "rate": args.variable_arrival_rate,
+            "num_invocations": args.num_queries,
+            "coefficient": args.coefficient,
+        },
+        "fixed_gamma": lambda: {
+            "variable_arrival_rate": args.variable_arrival_rate,
+            "base_arrival_rate": args.base_arrival_rate,
+            "num_invocations": args.num_queries,
+            "coefficient": args.coefficient,
+        },
+    }
+    if args.distribution not in policy_args_for:
+        raise NotImplementedError(
+            f"Release policy {args.distribution} not implemented."
+        )
+
+    start_time_range = (args.randomize_start_time_min, args.randomize_start_time_max)
+    release_policy = make_release_policy(
+        args.distribution,
+        policy_args_for[args.distribution](),
+        rng,
+        args.rng_seed,
+        start_time_range,
+    )
+    return release_policy.get_release_times(
+        completion_time=EventTime(sys.maxsize, EventTime.Unit.US)
+    )
+
+
+def main():
+    """Launch TPC-H queries at generated release times, then stop the service."""
+    parser = argparse.ArgumentParser(
+        description="Generate a workload of queries based on distribution type."
+    )
+    parser.add_argument(
+        "--spark-mirror-path",
+        type=Path,
+        required=True,
+        help="Path to spark-mirror repository",
+    )
+    parser.add_argument(
+        "--spark-master-ip",
+        type=str,
+        required=True,
+        help="IP address of node running Spark master",
+    )
+    parser.add_argument(
+        "--tpch-spark-path",
+        type=Path,
+        required=True,
+        help="Path to TPC-H Spark repository",
+    )
+    parser.add_argument(
+        "--spark-eventlog-dir",
+        default=Path(os.getcwd()) / "spark-eventlog",
+        type=Path,
+        help="Path to directory in which Spark event logs will be dumped",
+    )
+    parser.add_argument(
+        "--distribution",
+        choices=["periodic", "fixed", "poisson", "gamma", "closed_loop", "fixed_gamma"],
+        default="gamma",
+        help="Type of distribution for query inter-arrival times (default: gamma)",
+    )
+    parser.add_argument(
+        "--num_queries",
+        type=int,
+        default=50,
+        help="Number of queries to generate (default: 50)",
+    )
+    parser.add_argument(
+        "--dataset_size",
+        choices=["50", "100", "250", "500"],
+        default="50",
+        help="Dataset size per query in GB (default: 50)",
+    )
+    parser.add_argument(
+        "--max_cores",
+        type=int,
+        choices=[50, 75, 100, 200],
+        default=50,
+        help="Maximum executor cores (default: 50)",
+    )
+    parser.add_argument(
+        "--period",
+        type=int,
+        default=25,
+        help="Releases a DAG after period time has elapsed",
+    )
+    parser.add_argument(
+        "--variable_arrival_rate",
+        type=float,
+        default=1.0,
+        help="Variable arrival rate for poisson, gamma and fixed_gamma distributions",
+    )
+    parser.add_argument(
+        "--coefficient",
+        type=float,
+        default=1.0,
+        help="Coefficient for gamma and fixed_gamma distributions",
+    )
+    parser.add_argument(
+        "--base_arrival_rate",
+        type=float,
+        default=1.0,
+        help="Base arrival rate for fixed_gamma distribution",
+    )
+    parser.add_argument("--randomize_start_time_min", type=int, default=0)
+    parser.add_argument("--randomize_start_time_max", type=int, default=0)
+    parser.add_argument(
+        "--rng_seed",
+        type=int,
+        default=1234,
+        help="RNG seed for generating inter-arrival periods and picking DAGs (default: 1234)",
+    )
+    parser.add_argument(
+        "--queries", type=int, nargs="+", help="Launch specific queries"
+    )
+
+    args = parser.parse_args()
+    if args.queries and len(args.queries) != args.num_queries:
+        # Validate with parser.error rather than assert, which -O strips.
+        parser.error("--queries must list exactly --num_queries query numbers")
+
+    args.spark_eventlog_dir.mkdir(parents=True, exist_ok=True)
+    os.environ["TPCH_INPUT_DATA_DIR"] = str(args.tpch_spark_path.resolve() / "dbgen")
+
+    rng = random.Random(args.rng_seed)
+    # Generate release times
+    release_times = generate_release_times(rng, args)
+    print("Release times:", release_times)
+
+    # Launch queries, sleeping for the gap between consecutive release times.
+    # NOTE(review): release times are built with EventTime.Unit.US but are slept
+    # as seconds here -- confirm this is intended.
+    ps = []
+    previous_release = 0
+    for i, release_time in enumerate(release_times):
+        time.sleep(release_time.time - previous_release)
+        previous_release = release_time.time
+        query_number = args.queries[i] if args.queries else rng.randint(1, 22)
+        ps.append(launch_query(query_number, i, args))
+        print(
+            f"({i+1}/{len(release_times)})",
+            "Current time: ",
+            time.strftime("%Y-%m-%d %H:%M:%S"),
+            " launching query: ",
+            query_number,
+        )
+
+    for p in ps:
+        # launch_query returns None when the submission could not be started.
+        if p is not None:
+            p.wait()
+
+    # Wait for some time before sending the shutdown signal
+    time.sleep(20)
+
+    channel = grpc.insecure_channel("localhost:50051")
+    try:
+        stub = erdos_scheduler_pb2_grpc.SchedulerServiceStub(channel)
+        stub.Shutdown(erdos_scheduler_pb2.Empty())
+    finally:
+        channel.close()
+    print("Sent shutdown signal to the service")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/rpc/protos/erdos_scheduler.proto b/rpc/protos/rpc/erdos_scheduler.proto
similarity index 98%
rename from rpc/protos/erdos_scheduler.proto
rename to rpc/protos/rpc/erdos_scheduler.proto
index 494f5b49..262254da 100644
--- a/rpc/protos/erdos_scheduler.proto
+++ b/rpc/protos/rpc/erdos_scheduler.proto
@@ -47,6 +47,8 @@ service SchedulerService {
 
     /// Notifies the Scheduler that a Task from a particular TaskGraph has completed.option
     rpc NotifyTaskCompletion(NotifyTaskCompletionRequest) returns (NotifyTaskCompletionResponse) {}
+
+    rpc Shutdown(Empty) returns (Empty) {}
 }
 
 
@@ -199,4 +201,7 @@ message GetPlacementsResponse {
     bool success = 1;
     repeated Placement placements = 2;
     string message = 3;
+    bool terminate = 4; // terminate the task graph
 }
+
+message Empty {}
diff --git a/rpc/service.py b/rpc/service.py
index 2aaa2dc9..f4a7d504 100644
--- a/rpc/service.py
+++ b/rpc/service.py
@@ -1,39 +1,35 @@
-import asyncio
-import heapq
-import os
+import threading
 import sys
 import time
-from collections import defaultdict
+import asyncio
 from concurrent import futures
-from operator import attrgetter
-from typing import Mapping, Sequence
 from urllib.parse import urlparse
+from typing import Optional, Dict, Callable, Tuple
+from enum import Enum
+from dataclasses import dataclass
 
-sys.path.append(
-    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
-)
-
-import erdos_scheduler_pb2
-import erdos_scheduler_pb2_grpc
-import grpc
-from absl import app, flags
-from tpch_utils import get_all_stage_info_for_query, verify_and_relable_tpch_app_graph
-
-from schedulers import EDFScheduler, FIFOScheduler
-from utils import EventTime, setup_logging
+# TODO: refactor out the need to import main to get common flags
+import main
+from simulator import Simulator, Event, EventTime, EventType
 from workers import Worker, WorkerPool, WorkerPools
 from workload import (
-    ExecutionStrategies,
-    ExecutionStrategy,
-    Job,
-    Placement,
     Resource,
     Resources,
-    Task,
-    TaskGraph,
     Workload,
-    WorkProfile,
+    TaskGraph,
+    TaskState,
+    Placement,
+    JobGraph,
 )
+from data import BaseWorkloadLoader
+from data.tpch_loader import TpchLoader
+from utils import setup_logging, setup_csv_logging
+from rpc import erdos_scheduler_pb2
+from rpc import erdos_scheduler_pb2_grpc
+
+import grpc
+
+from absl import app, flags
 
 FLAGS = flags.FLAGS
 
@@ -41,13 +37,6 @@
 flags.DEFINE_integer(
     "max_workers", 10, "Maximum number of workers to use for the RPC server."
 )
-flags.DEFINE_string("log_file", None, "Path to the log file.", short_name="log")
-flags.DEFINE_string("log_level", "debug", "The level to log.")
-flags.DEFINE_integer(
-    "initial_executors",
-    10,
-    "The initial number of executors that are requested by each application.",
-)
 flags.DEFINE_integer(
     "virtualized_cores",
     500,
@@ -63,809 +52,829 @@
     "The amount of virtualized memory (in GB) that must be created in each Worker on "
     "the framework. Refer to the `virtualized_cores` flag for more information.",
 )
-flags.DEFINE_enum(
-    "scheduler", "EDF", ["FIFO", "EDF"], "The scheduler to use for this execution."
+flags.DEFINE_integer(
+    "spark_app_num_initial_executors",
+    10,
+    "The initial number of executors that are requested by each Spark application.",
+)
+flags.DEFINE_bool(
+    "override_worker_cpu_count",
+    False,
+    "If True, worker CPU count will be set to 640 (Cloudlab 20-node cluster CPU count). "
+    "This allows us to scale up spark experiments without actually deploying a large "
+    "spark cluster.",
 )
 
 
-# Define an item containing completion timestamp and task
-class TimedItem:
-    def __init__(self, timestamp, task):
-        self.timestamp = timestamp
-        self.task = task
+class DataLoader(Enum):
+    TPCH = "tpch"
 
 
-# Define a priority queue based on heapq module
-class PriorityQueue:
-    def __init__(self):
-        self._queue = []
+class WorkloadLoader(BaseWorkloadLoader):
+    def __init__(self, _flags) -> None:
+        self._workload = Workload.empty(_flags)
 
-    def put(self, item):
-        heapq.heappush(self._queue, (item.timestamp, item))
+    def add_task_graph(self, task_graph: TaskGraph):
+        self._workload.add_task_graph(task_graph)
 
-    def get(self):
-        _, item = heapq.heappop(self._queue)
-        return item
+    def get_next_workload(self, current_time: EventTime) -> Optional[Workload]:
+        return self._workload
 
-    def empty(self):
-        return len(self._queue) == 0
 
+@dataclass
+class RegisteredApplication:
+    """
+    Represents a registered application that can be used to generate task
+    graphs. It also manages the mapping between Spark stage IDs and canonical
+    task IDs.
 
-# Implement the service.
-class SchedulerServiceServicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer):
-    def __init__(self) -> None:
-        """Initialize the service, and setup the logger."""
-        # Values used by the Servicer.
-        self._logger = setup_logging(name=FLAGS.log_file, log_level=FLAGS.log_level)
-        self._initialized = False
-        self._initialization_time = -1
-        self._master_uri = None
+    A registered application is ready if the `task_graph` attribute is set.
 
-        # The simulator types maintained by the Servicer.
-        self._worker_pool = None
-        self._worker_pools = None
-        self._drivers: Mapping[str, Task] = {}
-        self._workload = None
-
-        # Scheduler information maintained by the servicer.
-        self._scheduler_running_lock = asyncio.Lock()
-        self._scheduler_running = False
-        self._rerun_scheduler = False
-        if FLAGS.scheduler == "EDF":
-            self._scheduler = EDFScheduler()
-        elif FLAGS.scheduler == "FIFO":
-            self._scheduler = FIFOScheduler()
-        else:
-            raise ValueError(f"Unknown scheduler {FLAGS.scheduler}.")
+    Attributes:
+        gen (Callable[[EventTime], Tuple[TaskGraph, Dict[int,int]]]):
+            A function that takes a release time and outputs:
+            - A task graph
+            - A mapping from Spark stage IDs to canonical task IDs
 
-        # Placement information maintained by the servicer.
-        # The placements map the application IDs to the Placement retrieved from the
-        # scheduler. The placements are automatically clipped at the time of informing
-        # the framework of applying them to the executors.
-        # NOTE (Sukrit): This must always be sorted by the Placement time.
-        self._placements: Mapping[str, Sequence[Placement]] = defaultdict(list)
+        task_graph (TaskGraph, optional):
+            The generated task graph for the application. Defaults to None.
 
-        # Additional task information maintained by the servicer
-        self._tasks_marked_for_completion = PriorityQueue()
+    Methods:
+        generate_task_graph(release_time: EventTime):
+            Sets the `task_graph` attribute by generating a task graph for a
+            given `release_time`.
 
-        # Start the asyncio loop for clearing out pending tasks for completion
-        asyncio.create_task(self.PopTasksBasedOnTime())
+        spark_task_id(task_id: int):
+            Returns the Spark stage ID corresponding to a canonical task ID.
 
-        super().__init__()
+        canonical_task_id(stage_id: int):
+            Returns the canonical task ID corresponding to a Spark stage ID.
+    """
 
-    async def schedule(self) -> None:
-        """Schedules the tasks that have been added to the Workload."""
-        async with self._scheduler_running_lock:
-            if self._scheduler_running:
-                self._logger.error(
-                    "Scheduler already running, this should never be reached."
-                )
-                return
-            self._scheduler_running = True
-
-        current_time = EventTime(int(time.time()), EventTime.Unit.S)
-        self._logger.info(
-            "Starting a scheduling cycle with %s TaskGraphs and %s Workers at %s.",
-            len(self._workload.task_graphs),
-            len(self._worker_pool.workers),
-            current_time,
-        )
+    gen: Callable[[EventTime], Tuple[TaskGraph, Dict[int, int]]]
+    task_graph: TaskGraph = None
 
-        # TODO (Sukrit): Change this to a better implementation.
-        # Let's do some simple scheduling for now, that gives a fixed number of
-        # executors to all the available applications in intervals of 10 seconds.
-        if len(self._workload.task_graphs) >= 2:
-            placements = self._scheduler.schedule(
-                sim_time=current_time,
-                workload=self._workload,
-                worker_pools=self._worker_pools,
-            )
-            # Filter the placements that are not of type PLACE_TASK and that have not
-            # been placed.
-            filtered_placements = filter(
-                lambda p: p.placement_type == Placement.PlacementType.PLACE_TASK
-                and p.is_placed(),
-                placements,
-            )
-            for placement in sorted(
-                filtered_placements, key=attrgetter("placement_time")
-            ):
-                self._placements[placement.task.task_graph].append(placement)
-                # Schedule the task here since marking it as running requires it to be
-                # scheduled before. We mark it to be running when we inform the
-                # framework of the placement.
-                placement.task.schedule(
-                    time=placement.placement_time,
-                    placement=placement,
-                )
+    _forward: Dict[int, int] = None  # spark stage id => canonical task id
+    _backward: Dict[int, int] = None  # canonical task id => spark stage id
+    _last_gen: EventTime = None
+
+    def __init__(self, gen):
+        self.gen = gen
+
+    def generate_task_graph(self, release_time: EventTime):
+        task_graph, stage_id_mapping = self.gen(release_time)
+        self.task_graph = task_graph
+        self._forward = stage_id_mapping
+        self._backward = {v: k for k, v in self._forward.items()}
+        self._last_gen = release_time
+
+    def spark_task_id(self, task_id: int):
+        return self._backward[task_id]
+
+    def canonical_task_id(self, stage_id: int):
+        return self._forward[stage_id]
+
+
+class Servicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer):
+    def __init__(self, server) -> None:
+        self._server = server
+
+        # Override some flags
+
+        # Enable orchestrated mode
+        FLAGS.orchestrated = True
+        # Set scheduler runtime to zero
+        FLAGS.scheduler_runtime = 0
+
+        self._logger = setup_logging(
+            name=__name__,
+            log_dir=FLAGS.log_dir,
+            log_file=FLAGS.log_file_name,
+            log_level=FLAGS.log_level,
+            fmt="[%(asctime)s] {%(funcName)s:%(lineno)d} - %(message)s",
+        )
+        self._csv_logger = setup_csv_logging(
+            name=__name__,
+            log_dir=FLAGS.log_dir,
+            log_file=FLAGS.csv_file_name,
+        )
+        for flag_name in FLAGS:
+            self._csv_logger.debug(
+                f"input_flag,{flag_name},{getattr(FLAGS, flag_name)}"
+            )
 
-        self._logger.info(
-            "Finished the scheduling cycle initiated at %s.", current_time
+        self._master_uri = None
+        self._initialization_time = None
+        self._data_loaders = {}
+        self._data_loaders[DataLoader.TPCH] = TpchLoader(
+            path=FLAGS.tpch_query_dag_spec,
+            flags=FLAGS,
         )
+        self._simulator = None
+        self._workload_loader = None
+
+        # Instantiate the scheduler based on the given flag.
+        self._scheduler = None
+        if FLAGS.scheduler == "FIFO":
+            from schedulers import FIFOScheduler
+
+            self._scheduler = FIFOScheduler(
+                preemptive=FLAGS.preemption,
+                runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US),
+                enforce_deadlines=FLAGS.enforce_deadlines,  # TODO: (DG) Check why this isn't passed in the simulator
+                _flags=FLAGS,
+            )
+        elif FLAGS.scheduler == "EDF":
+            from schedulers import EDFScheduler
+
+            self._scheduler = EDFScheduler(
+                preemptive=FLAGS.preemption,
+                runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US),
+                enforce_deadlines=FLAGS.enforce_deadlines,
+                _flags=FLAGS,
+            )
+        elif FLAGS.scheduler == "TetriSched":
+            from schedulers import TetriSchedScheduler
+
+            finer_discretization = FLAGS.finer_discretization_at_prev_solution
+            self._scheduler = TetriSchedScheduler(
+                preemptive=FLAGS.preemption,
+                runtime=EventTime(FLAGS.scheduler_runtime, EventTime.Unit.US),
+                lookahead=EventTime(FLAGS.scheduler_lookahead, EventTime.Unit.US),
+                enforce_deadlines=FLAGS.enforce_deadlines,
+                retract_schedules=FLAGS.retract_schedules,
+                release_taskgraphs=FLAGS.release_taskgraphs,
+                goal=FLAGS.ilp_goal,
+                time_discretization=EventTime(
+                    FLAGS.scheduler_time_discretization, EventTime.Unit.US
+                ),
+                plan_ahead=EventTime(FLAGS.scheduler_plan_ahead, EventTime.Unit.US),
+                log_to_file=FLAGS.scheduler_log_to_file,
+                adaptive_discretization=FLAGS.scheduler_adaptive_discretization,
+                _flags=FLAGS,
+                max_time_discretization=EventTime(
+                    FLAGS.scheduler_max_time_discretization, EventTime.Unit.US
+                ),
+                max_occupancy_threshold=FLAGS.scheduler_max_occupancy_threshold,
+                finer_discretization_at_prev_solution=finer_discretization,
+                finer_discretization_window=EventTime(
+                    FLAGS.finer_discretization_window, EventTime.Unit.US
+                ),
+                plan_ahead_no_consideration_gap=EventTime(
+                    FLAGS.scheduler_plan_ahead_no_consideration_gap, EventTime.Unit.US
+                ),
+            )
+        else:
+            raise ValueError(f"Unknown scheduler {FLAGS.scheduler}.")
 
-        # Check if another run of the Scheduler has been requested, and if so, create
-        # a task for it. Otherwise, mark the scheduler as not running.
-        async with self._scheduler_running_lock:
-            self._scheduler_running = False
-            if self._rerun_scheduler:
-                self._rerun_scheduler = False
-                asyncio.create_task(self.schedule())
-
-    async def run_scheduler(self) -> None:
-        """Checks if the scheduler is running, and if not, starts it.
-
-        If the scheduler is already running, we queue up another execution of the
-        scheduler. This execution batches the scheduling requests, and runs the
-        scheduler only once for all the requests."""
-        async with self._scheduler_running_lock:
-            if not self._scheduler_running:
-                asyncio.create_task(self.schedule())
-            else:
-                self._rerun_scheduler = True
+        # TODO: Items in _registered_applications are never deleted right now, needs to be handled.
+        self._registered_applications = {}
+        self._registered_app_drivers = (
+            {}
+        )  # Spark driver id differs from taskgraph name (application id)
+        self._received_shutdown = False
+        self._shutting_down = False
+        self._lock = threading.Lock()
+
+        super().__init__()
 
     async def RegisterFramework(self, request, context):
-        """Registers a new framework with the backend scheduler.
-        This is the entry point for a new instance of Spark / Flink to register
-        itself with the backend scheduler, and is intended as an EHLO.
-        """
-        if self._initialized:
-            self._logger.warning(
-                "Framework already registered at %s with the address %s",
-                self._initialization_time,
-                self._master_uri,
-            )
+        stime = self.__stime()
+
+        if self.__framework_registered():
+            msg = f"[{stime}] Framework already registered at the address {self._master_uri} at timestamp {self._initialization_time}"
+            self._logger.error(msg)
             return erdos_scheduler_pb2.RegisterFrameworkResponse(
                 success=False,
-                message=f"Framework already registered at "
-                f"{self._initialization_time} at the address {self._master_uri}",
+                message=msg,
             )
 
-        # Setup a new Framework instance.
+        t = int(time.time())
         framework_name = request.name
         self._master_uri = request.uri
-        self._initialization_time = request.timestamp
-        self._initialized = True
-        self._logger.info(
-            "Registering framework %s with URI %s at %s",
-            framework_name,
-            self._master_uri,
-            self._initialization_time,
-        )
+        self._initialization_time = EventTime(t, EventTime.Unit.US)
+        stime = self.__stime()
 
-        # Setup the simulator types.
         parsed_uri = urlparse(self._master_uri)
-        self._worker_pool = WorkerPool(name=f"WorkerPool_{parsed_uri.netloc}")
-        self._worker_pools = WorkerPools(worker_pools=[self._worker_pool])
-        self._workload = Workload.from_task_graphs({})
+        worker_pool = WorkerPool(
+            name=f"WorkerPool_{parsed_uri.netloc}",
+            _logger=self._logger,
+        )
+        self._workload_loader = WorkloadLoader(FLAGS)
+
+        self._simulator = Simulator(
+            scheduler=self._scheduler,
+            worker_pools=WorkerPools(
+                [worker_pool]
+            ),  # Maintain only one worker pool in the simulator
+            workload_loader=self._workload_loader,
+            _flags=FLAGS,
+        )
 
-        # Return the response.
-        return erdos_scheduler_pb2.RegisterFrameworkResponse(
-            success=True,
-            message=f"{framework_name} at {self._master_uri} registered successfully!",
+        msg = f"[{stime}] Registered the framework '{framework_name}' with URI {self._master_uri} at UNIX time {self._initialization_time.time}"
+        self._logger.info(msg)
+        return erdos_scheduler_pb2.RegisterFrameworkResponse(success=True, message=msg)
+
+    async def DeregisterFramework(self, request, context):
+        stime = self.__stime()
+
+        if not self.__framework_registered():
+            msg = f"[{stime}] Trying to deregister a framework at {request.uri} but no framework has been registered yet."
+            self._logger.error(msg)
+            return erdos_scheduler_pb2.DeregisterFrameworkResponse(
+                success=False, message=msg
+            )
+
+        if self._master_uri != request.uri:
+            msg = f"[{stime}] Trying to deregister the framework at {request.uri} but the registered framework is at {self._master_uri}"
+            self._logger.error(msg)
+            return erdos_scheduler_pb2.DeregisterFrameworkResponse(
+                success=False, message=msg
+            )
+
+        self._initialization_time = None
+        self._master_uri = None
+        self._workload_loader = None
+        self._simulator = None
+        msg = f"[{stime}] Successfully deregistered the framework at {request.uri}"
+        self._logger.info(msg)
+        return erdos_scheduler_pb2.DeregisterFrameworkResponse(
+            success=True, message=msg
         )
 
     async def RegisterDriver(self, request, context):
-        if not self._initialized:
-            self._logger.warning(
-                "Trying to register a driver with name %s and id %s, "
-                "but no framework is registered yet.",
-                request.name,
-                request.id,
+        stime = self.__stime()
+
+        if not self.__worker_registered():
+            msg = f"[{stime}] Failed to register driver (id={request.id}) because no worker has been registered yet."
+            self._logger.error(msg)
+            return erdos_scheduler_pb2.RegisterDriverResponse(
+                success=False,
+                message=msg,
             )
+
+        if request.id in self._registered_app_drivers:
+            msg = f"[{stime}] Driver with id '{request.id}' is already registered"
+            self._logger.error(msg)
             return erdos_scheduler_pb2.RegisterDriverResponse(
                 success=False,
-                message="Framework not registered yet.",
-                worker_id="",
+                message=msg,
+                worker_id=self.__get_worker_id(),
             )
 
-        # Create a Task for the Driver, and add it to the list of drivers.
-        # TODO (Sukrit): We drop the memory requirements for now, we should use
-        # them to do multi-dimensional packing using STRL.
-        self._logger.info(
-            "Received a request to register a driver with name %s, URI: %s. "
-            "The driver requires %s cores and %s memory.",
-            request.id,
-            request.uri,
-            request.cores,
-            request.memory,
-        )
-        driver_resources = Resources(
-            resource_vector={Resource(name="Slot_CPU", _id="any"): 1}
-        )
-        driver_job = Job(
-            name=request.id,
-            profile=WorkProfile(
-                name=f"WorkProfile_{request.id}",
-                execution_strategies=ExecutionStrategies(
-                    [
-                        ExecutionStrategy(
-                            resources=driver_resources,
-                            batch_size=1,
-                            # NOTE (Sukrit): Drivers are long running, and have no
-                            # fixed runtime. Setting it to zero helps us unload the
-                            # driver from the Worker whenever we need it.
-                            runtime=EventTime.zero(),
-                        )
-                    ]
-                ),
-            ),
+        # TODO: Update the registered_app_drivers to map the driver id to
+        # application id once the taskgraph is registered.
+        self._registered_app_drivers[request.id] = None
+
+        msg = (
+            f"[{stime}] Successfully registered driver {request.id} for an application."
         )
-        driver = Task(
-            name=request.id,
-            task_graph=request.uri,
-            job=driver_job,
-            deadline=EventTime.invalid(),
+        self._logger.info(msg)
+        return erdos_scheduler_pb2.RegisterDriverResponse(
+            success=True,
+            message=msg,
+            worker_id=self.__get_worker_id(),
         )
-        self._drivers[request.id] = driver
-
-        # Iterate over the Workers and find a Worker that can accomodate the driver.
-        placement_found = False
-        for worker in self._worker_pool.workers:
-            for execution_strategy in driver.available_execution_strategies:
-                if worker.can_accomodate_strategy(execution_strategy):
-                    # This Worker can accomodate the Driver, we assign it here.
-                    placement_found = True
-                    self._worker_pool.place_task(driver, execution_strategy, worker.id)
-
-                    # Update the Task's state and placement information.
-                    placement_time = EventTime(request.timestamp, EventTime.Unit.S)
-                    driver.schedule(
-                        time=placement_time,
-                        placement=Placement(
-                            type=Placement.PlacementType.PLACE_TASK,
-                            computation=driver,
-                            placement_time=placement_time,
-                            worker_pool_id=self._worker_pool.id,
-                            worker_id=worker.id,
-                            strategy=execution_strategy,
-                        ),
-                    )
-                    driver.start(placement_time)
 
-                    # Tell the framework to start the driver.
-                    return erdos_scheduler_pb2.RegisterDriverResponse(
-                        success=True,
-                        message=f"Driver {request.id} registered successfully!",
-                        worker_id=worker.name,
-                    )
+    async def DeregisterDriver(self, request, context):
+        stime = self.__stime()
 
-        if not placement_found:
-            return erdos_scheduler_pb2.RegisterDriverResponse(
+        if request.id not in self._registered_app_drivers:
+            msg = f"[{stime}] Driver id '{request.id}' is not registered or does not exist"
+            self._logger.error(msg)
+            return erdos_scheduler_pb2.DeregisterDriverResponse(
                 success=False,
-                message=f"No Worker can accomodate the driver {request.id} yet.",
-                worker_id="",
+                message=msg,
             )
 
-    async def DeregisterDriver(self, request, context):
-        if not self._initialized:
-            self._logger.warning(
-                "Trying to deregister a driver with id %s, "
-                "but no framework is registered yet.",
-                request.id,
-            )
-            return erdos_scheduler_pb2.DeregisterDriverResponse(
-                success=False, message="Framework not registered yet."
-            )
+        # TODO: Dummy mapping from driver to task graph (application), so task_graph_name is None.
+        # Deletion of taskgraph from registered_applications and driver from registered_app_drivers should be done carefully.
+        task_graph_name = self._registered_app_drivers[request.id]
+        del self._registered_app_drivers[request.id]
 
-        if request.id not in self._drivers:
-            self._logger.warning(
-                "Trying to deregister a driver with id %s, "
-                "but no driver with that id is registered.",
-                request.id,
-            )
-            return erdos_scheduler_pb2.DeregisterDriverResponse(
-                success=False,
-                message=f"Driver with id {request.id} not registered yet.",
+        with self._lock:
+            log_stats_event = Event(
+                event_type=EventType.LOG_STATS,
+                time=self.__stime(),
             )
+            self._simulator._event_queue.add_event(log_stats_event)
+
+        msg = f"[{stime}] Successfully de-registered driver with id {request.id} for task graph {task_graph_name}"
+        self._logger.info(msg)
+
+        if len(self._registered_app_drivers) == 0 and self._received_shutdown:
+            self._logger.info(f"[{stime}] The last driver has been deregistered; finishing simulation")
+            # Signals _tick_simulator() to stop.  Shouldn't be
+            # necessary in principle because after the with block
+            # ends, there shouldn't be any more events left to run,
+            # but doesn't hurt.
+            self._shutting_down = True
+            with self._lock:
+                self._simulator.simulate()
+            await self._server.stop(0)
 
-        # Deregister the driver.
-        driver = self._drivers[request.id]
-        completion_time = EventTime(request.timestamp, EventTime.Unit.S)
-        self._worker_pool.remove_task(completion_time, driver)
-        driver.finish(completion_time)
-        del self._drivers[request.id]
         return erdos_scheduler_pb2.DeregisterDriverResponse(
             success=True,
-            message=f"Driver with id {request.id} deregistered successfully!",
+            message=msg,
         )
 
     async def RegisterTaskGraph(self, request, context):
-        """Registers a new TaskGraph with the backend scheduler.
-        This is the entry point for a new application of Spark to register
-        itself with the backend scheduler, and is intended as an EHLO.
-        """
-        if not self._initialized:
-            self._logger.warning(
-                "Trying to register a task graph with ID %s and name %s, "
-                "but no framework is registered yet.",
-                request.id,
-                request.name,
-            )
+        stime = self.__stime()
+
+        if not self.__worker_registered():
+            msg = f"[{stime}] Failed to register task graph (id={request.id}, name={request.name}) because no worker has been registered yet."
+            self._logger.error(msg)
             return erdos_scheduler_pb2.RegisterTaskGraphResponse(
-                success=False, message="Framework not registered yet.", num_executors=0
+                success=False, message=msg, num_executors=0
             )
 
-        if request.id in self._workload.task_graphs:
-            self._logger.warning(
-                "The application with ID %s and name %s was already registered.",
-                request.id,
-                request.name,
-            )
+        if request.id in self._registered_applications:
+            msg = f"[{stime}] The task graph (id={request.id}, name={request.name}) is already registered"
+            self._logger.error(msg)
             return erdos_scheduler_pb2.RegisterTaskGraphResponse(
-                success=False,
-                message=f"Application ID {request.id} with name {request.name} "
-                f"already registered!",
-                num_executors=0,
-            )
+                success=False, message=msg, num_executors=0
+            )
+
+        # We only support TPCH queries for now
+        if request.name.startswith("TPCH Query"):
+            # Parse request name
+            query_parts = request.name.split()
+            match query_parts:
+                case _, _, query_num, index, dataset_size, max_executors_per_job:
+                    query_num = int(query_num)
+                    dataset_size = int(dataset_size)
+                    max_executors_per_job = int(max_executors_per_job)
+                case _, _, query_num, dataset_size, max_executors_per_job:
+                    query_num = int(query_num)
+                    # default index counts up from 0; incorrect if
+                    # Spark receives jobs out of order
+                    index = str(len(self._registered_applications))
+                    dataset_size = int(dataset_size)
+                    max_executors_per_job = int(max_executors_per_job)
+                case _, _, query_num:
+                    query_num = int(query_num)
+                    index = str(len(self._registered_applications))
+                    dataset_size = FLAGS.tpch_dataset_size
+                    max_executors_per_job = FLAGS.tpch_max_executors_per_job
+                case _:
+                    msg = f"[{stime}] Invalid TPCH query request"
+                    return erdos_scheduler_pb2.RegisterTaskGraphResponse(
+                        success=False, message=msg, num_executors=0
+                    )
 
-        self._logger.info(
-            "Attempting to register application ID %s with name %s",
-            request.id,
-            request.name,
-        )
-        # Check if query is from TPC-H workload.
-        # If yes, retrieve profiled slots and runtime info. If no, use default values
-        is_tpch_query = False
-        tpch_query_all_stage_info = None
-        if request.name.startswith("TPCH_"):
-            is_tpch_query = True
-            # retrieve tasks-per-stage and runtime info based on query number
-            tpch_query_num = request.name.split("TPCH_Q", 1)[1]
-            tpch_query_all_stage_info = get_all_stage_info_for_query(tpch_query_num)
-            same_structure, stage_id_mapping = verify_and_relable_tpch_app_graph(
-                query_num=tpch_query_num, dependencies=request.dependencies
-            )
-
-            # return failure message if not tpch app isnt of same DAG structure
-            if not same_structure:
-                self._logger.warning(
-                    "TPCH application with ID %s and name %s couldn't be registered."
-                    "DAG structure mismatch!",
-                    request.id,
-                    request.name,
+            # Convert request.dependencies to [{name: int, children: [int]}]
+            dependencies = []
+            for dep in request.dependencies:
+                dependencies.append(
+                    {
+                        "name": int(dep.key.id),
+                        "children": [int(c) for c in dep.children_ids],
+                    }
+                )
+
+            # Create a job graph
+            self._logger.debug(str((query_num, index, dataset_size, max_executors_per_job)))
+            try:
+                job_graph, stage_id_mapping = self._data_loaders[
+                    DataLoader.TPCH
+                ].make_job_graph(
+                    id=index,
+                    query_num=query_num,
+                    dependencies=dependencies,
+                    dataset_size=dataset_size,
+                    max_executors_per_job=max_executors_per_job,
+                    runtime_unit=EventTime.Unit.US,
                 )
+            except Exception as e:
+                msg = f"[{stime}] Failed to load TPCH query {query_num}. Exception: {e}"
+                self._logger.error(msg)
                 return erdos_scheduler_pb2.RegisterTaskGraphResponse(
-                    success=False,
-                    message=f"TPCH application ID {request.id} with name {request.name}"
-                    f" couldn't be registered. DAG structure mismatch!",
-                    num_executors=0,
+                    success=False, message=msg, num_executors=0
                 )
 
-        # Construct all the Tasks for the TaskGraph.
-        task_ids_to_task: Mapping[int, Task] = {}
-        default_resource = Resources(
-            resource_vector={Resource(name="Slot_CPU", _id="any"): 20}
-        )
-        default_runtime = EventTime(20, EventTime.Unit.US)
-
-        for task_dependency in request.dependencies:
-            framework_task = task_dependency.key
-            if is_tpch_query:
-                mapped_stage_id = stage_id_mapping[framework_task.id]
-                task_slots = tpch_query_all_stage_info[mapped_stage_id]["num_tasks"]
-                task_runtime = tpch_query_all_stage_info[mapped_stage_id][
-                    "avg_task_duration"
-                ]
-                self._logger.info(
-                    "Creating Task for given app TPCH stage: %s, mapped to "
-                    "original stage id %s, with tasks: %s and avg runtime: %s",
-                    framework_task.id,
-                    mapped_stage_id,
-                    task_slots,
-                    task_runtime,
+            if not self.__can_accomodate_task_graph(job_graph):
+                msg = f"[{stime}] The worker Pool cannot accomodate the task graph '{request.id}'"
+                self._logger.error(msg)
+                return erdos_scheduler_pb2.RegisterTaskGraphResponse(
+                    success=False, message=msg, num_executors=0
                 )
-            task_ids_to_task[framework_task.id] = Task(
-                name=framework_task.name,
-                task_graph=request.id,
-                job=Job(
-                    name=framework_task.name,
-                    profile=WorkProfile(
-                        name=f"WorkProfile_{framework_task.name}",
-                        execution_strategies=ExecutionStrategies(
-                            [
-                                ExecutionStrategy(
-                                    resources=(
-                                        default_resource
-                                        if not is_tpch_query
-                                        else Resources(
-                                            resource_vector={
-                                                Resource(
-                                                    name="Slot_CPU", _id="any"
-                                                ): task_slots
-                                            }
-                                        )
-                                    ),
-                                    batch_size=1,
-                                    runtime=(
-                                        default_runtime
-                                        if not is_tpch_query
-                                        else EventTime(task_runtime, EventTime.Unit.US)
-                                    ),
-                                )
-                            ]
-                        ),
-                    ),
-                ),
-                deadline=EventTime(request.deadline, EventTime.Unit.S),
-                # TODO (Sukrit): We should maintain a counter for each application
-                # type so that we can correlate the Tasks with a particular invocation.
-                timestamp=1,
-            )
-            # NOTE (Sukrit): We maintain the StageID of the Task as a separate field
-            # that is not accessible / used by the Simulator.
-            task_ids_to_task[framework_task.id].stage_id = framework_task.id
-            self._logger.info(
-                "Constructed Task %s for the TaskGraph %s.",
-                framework_task.name,
-                request.id,
-            )
-
-        # Construct the TaskGraph from the Tasks.
-        task_graph_structure: Mapping[Task, Sequence[Task]] = {}
-        for task_dependency in request.dependencies:
-            task_graph_structure[task_ids_to_task[task_dependency.key.id]] = [
-                task_ids_to_task[task_id] for task_id in task_dependency.children_ids
-            ]
-        task_graph = TaskGraph(
-            name=request.id,
-            tasks=task_graph_structure,
-        )
-        self._workload.add_task_graph(task_graph)
-        self._logger.info(
-            "Added the TaskGraph(name=%s, id=%s) to the Workload.",
-            request.name,
-            request.id,
-        )
-        self._logger.info(
-            "The structure of the TaskGraph %s is \n%s.",
-            request.id,
-            str(task_graph),
-        )
 
-        # Return the response.
+            def gen(release_time):
+                task_graph = job_graph.get_next_task_graph(
+                    start_time=release_time,
+                    _flags=FLAGS,
+                )
+                return task_graph, stage_id_mapping
+
+        else:
+            msg = f"[{stime}] The service only supports TPCH queries"
+            self._logger.error(msg)
+            return erdos_scheduler_pb2.RegisterTaskGraphResponse(
+                success=False, message=msg, num_executors=0
+            )
+
+        self._registered_applications[request.id] = RegisteredApplication(gen=gen)
+
+        msg = f"[{stime}] Registered task graph '{request.id}' successfully"
+        self._logger.info(msg)
         return erdos_scheduler_pb2.RegisterTaskGraphResponse(
             success=True,
-            message=f"Application ID {request.id} with name "
-            f"{request.name} and deadline {request.deadline} registered successfully!",
-            num_executors=FLAGS.initial_executors,
+            message=msg,
+            num_executors=FLAGS.spark_app_num_initial_executors,
         )
 
     async def RegisterEnvironmentReady(self, request, context):
-        """Registers that the environment (i.e., executors) are ready for the given
-        TaskGraph at the specified time.
-
-        This is intended to release the sources of the TaskGraph to the scheduling
-        backend, to consider the application in this scheduling cycle.
-        """
-        if not self._initialized:
-            self._logger.warning(
-                "Trying to register that the environment is ready for the TaskGraph "
-                "with ID %s, but no framework is registered yet.",
-                request.id,
-            )
-            return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse(
-                success=False, message="Framework not registered yet."
-            )
-
-        task_graph = self._workload.get_task_graph(request.id)
-        if task_graph is None:
-            self._logger.warning(
-                "Trying to register that the environment is ready for the TaskGraph "
-                "with ID %s, but no TaskGraph with that ID is registered.",
-                request.id,
-            )
-            return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse(
-                success=False,
-                message=f"TaskGraph with ID {request.id} not registered yet.",
-            )
+        stime = self.__stime()
 
-        if request.num_executors != FLAGS.initial_executors:
-            self._logger.warning(
-                "The TaskGraph %s requires %s executors, but the environment is ready "
-                "with %s executors.",
-                request.id,
-                FLAGS.initial_executors,
-                request.num_executors,
-            )
+        if request.id not in self._registered_applications:
+            msg = f"[{stime}] Task graph of id '{request.id}' is not registered or does not exist"
+            self._logger.error(msg)
             return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse(
                 success=False,
-                message=f"Number of executors not {FLAGS.initial_executors}.",
+                message=msg,
             )
 
-        # Release all the sources of the TaskGraph at the given time.
-        for source_task in task_graph.get_source_tasks():
-            source_task.release(EventTime(request.timestamp, EventTime.Unit.S))
-
-        # Run the scheduler since the Workload has changed.
-        await self.run_scheduler()
+        r = self._registered_applications[request.id]
+        r.generate_task_graph(stime)
 
-        return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse(
-            success=True,
-            message=f"Environment ready for TaskGraph with ID {request.id}!",
-        )
+        with self._lock:
+            self._simulator._workload.add_task_graph(r.task_graph)
+            self._simulator._current_task_graph_placements[r.task_graph.name] = {}
 
-    async def DeregisterFramework(self, request, context):
-        """Deregisters the framework with the backend scheduler.
-        This is the exit point for a running instance of Spark / Flink to deregister"""
-        if not self._initialized:
-            self._logger.warning(
-                "Trying to deregister the framework at %s, "
-                "but no framework is registered yet.",
-                request.uri,
-            )
-            return erdos_scheduler_pb2.DeregisterFrameworkResponse(
-                success=False, message="Framework not registered yet."
-            )
+            for task in r.task_graph.get_releasable_tasks():
+                task_release_event = Event(
+                    event_type=EventType.TASK_RELEASE,
+                    time=self.__stime(),
+                    task=task,
+                )
+                self._logger.info(
+                    f"[{stime}] Added event {task_release_event} to the simulator's event queue",
+                )
+                self._simulator._event_queue.add_event(task_release_event)
 
-        if not self._master_uri == request.uri:
-            self._logger.warning(
-                "Trying to deregister the framework at %s, "
-                "but the registered framework is at %s.",
-                request.uri,
-                self._master_uri,
+            scheduler_start_event = Event(
+                event_type=EventType.SCHEDULER_START,
+                time=self.__stime(),
             )
-            return erdos_scheduler_pb2.DeregisterFrameworkResponse(
-                success=False,
-                message=f"Framework not registered at {request.uri} yet.",
+            self._simulator._event_queue.add_event(scheduler_start_event)
+            self._logger.info(
+                f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue"
             )
 
-        # Deregister the framework.
-        self._initialization_time = None
-        self._master_uri = None
-        self._initialized = False
-        self._logger.info("Deregistering framework at %s", request.uri)
-        return erdos_scheduler_pb2.DeregisterFrameworkResponse(
+        msg = f"[{stime}] Successfully marked environment as ready for task graph '{r.task_graph.name}'"
+        self._logger.info(msg)
+        return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse(
             success=True,
-            message=f"Framework at {request.uri} deregistered successfully!",
+            message=msg,
         )
 
     async def RegisterWorker(self, request, context):
-        """Registers a new worker with the backend scheduler."""
-        if not self._initialized:
-            self._logger.warning(
-                "Trying to register a worker with name %s and id %s, "
-                "but no framework is registered yet.",
-                request.name,
-                request.id,
-            )
+        stime = self.__stime()
+
+        if not self.__framework_registered():
+            msg = f"[{stime}] Trying to register a worker (id={request.id}, name={request.name}) but no framework is registered yet"
+            self._logger.error(msg)
             return erdos_scheduler_pb2.RegisterWorkerResponse(
-                success=False, message="Framework not registered yet."
+                success=False, message=msg
             )
 
-        # First, we construct the Resources with the given size.
-        # TODO (Sukrit): Right now, we drop the memory requirements, we should use
+        # TODO(Sukrit): Right now, we drop the memory requirements, we should use
         # them to do multi-dimensional packing using STRL.
-        cpu_resource = Resource(name="Slot_CPU")
-        worker_resources = Resources(resource_vector={cpu_resource: request.cores})
-        self._logger.debug(
-            "Successfully constructed the resources for the worker %s: %s.",
-            request.name,
-            worker_resources,
-        )
 
-        # Construct a new Worker instance, and add it to the WorkerPool.
+        cpu_resource = Resource(name="Slot")
+        worker_resources = Resources(
+            resource_vector={
+                cpu_resource: (
+                    request.cores if not FLAGS.override_worker_cpu_count else 640
+                )
+            },
+            _logger=self._logger,
+        )
         worker = Worker(
             name=request.id,
             resources=worker_resources,
-        )
-        self._worker_pool.add_workers([worker])
-
-        self._logger.info(
-            "Registering worker with name %s, and resources %s.",
-            worker.name,
-            worker_resources,
+            _logger=self._logger,
         )
 
-        # Run the scheduler since the Resource set has changed, and new task graphs
-        # may become eligible to run.
-        await self.run_scheduler()
+        self.__get_worker_pool().add_workers([worker])
 
+        msg = f"[{stime}] Registered worker (id={request.id}, name={request.name})"
+        self._logger.info(msg)
         return erdos_scheduler_pb2.RegisterWorkerResponse(
             success=True,
-            message=f"Worker {request.name} registered successfully!",
+            message=msg,
             cores=FLAGS.virtualized_cores,
             memory=FLAGS.virtualized_memory * 1024,
         )
 
-    async def NotifyTaskCompletion(self, request, context):
-        """Notifies the backend scheduler that a task has completed."""
-        if not self._initialized:
-            self._logger.warning(
-                "Trying to notify the backend scheduler that the task with ID %s "
-                "from application %s has completed, "
-                "but no framework is registered yet.",
-                request.task_id,
-                request.application_id,
+    async def GetPlacements(self, request, context):
+        stime = self.__stime()
+
+        # Check if the task graph is registered
+        if request.id not in self._registered_applications:
+            msg = f"[{stime}] Task graph with id '{request.id}' is not registered or does not exist"
+            self._logger.error(msg)
+            return erdos_scheduler_pb2.GetPlacementsResponse(
+                success=False,
+                message=msg,
             )
-            return erdos_scheduler_pb2.NotifyTaskCompletionResponse(
-                success=False, message="Framework not registered yet."
+
+        r = self._registered_applications[request.id]
+
+        if r.task_graph is None:
+            msg = f"[{stime}] Task graph '{request.id}' is not ready"
+            self._logger.error(msg)
+            return erdos_scheduler_pb2.GetPlacementsResponse(
+                success=True,
+                message=msg,
+                placements=[],
             )
 
-        task_graph = self._workload.get_task_graph(request.application_id)
-        if task_graph is None:
-            self._logger.warning(
-                "Trying to notify the backend scheduler that the task with ID %s "
-                "from application %s has completed, but the application "
-                "was not registered with the backend yet.",
-                request.task_id,
-                request.application_id,
+        if r.task_graph.is_complete():
+            msg = f"[{stime}] Task graph '{r.task_graph.name}' is complete. No more placements to provide."
+            self._logger.error(msg)
+            return erdos_scheduler_pb2.GetPlacementsResponse(
+                success=True,
+                message=msg,
+            )
+
+        # A task graph is considered complete if **all** of its **sink** tasks
+        # are complete. It is considered cancelled if **any** of its **sink**
+        # tasks are cancelled.
+
+        # If the task graph is complete, the Spark application will
+        # automatically shut down because it knows that all of its stages have
+        # finished executing.
+
+        # Matters get interesting in the presence of task cancellations. The
+        # service is aware of which tasks are cancelled.
+
+        # First, even when a task graph is cancelled, the simulator (without
+        # orchestration) continues to schedule and execute any tasks that
+        # were released into the system. The service, which runs the
+        # simulator in orchestrated mode, must emulate this behavior to
+        # maintain parity.
+
+        # Second, from Spark's point of view, however, those tasks are still
+        # pending placements. So, Spark will continue to periodically invoke
+        # `GetPlacements` in the hopes of receiving placements for those
+        # cancelled tasks. Left unhandled, the Spark application will loop
+        # indefinitely waiting for placements.
+
+        # We _could_ communicate these task cancellations to Spark. Then, we
+        # can modify the DAGScheduler to invoke GetPlacements until all of its
+        # stages have either finished executing or have been cancelled, after
+        # which it can safely terminate the application.
+
+        # However, we run into an issue due to VIRTUAL tasks. When a task is
+        # cancelled, the simulator invokes `TaskGraph.cancel(task)`.
+        # `TaskGraph.cancel(task)` traverses the tree rooted at `task`
+        # depth-first, cancelling tasks along the way until it finds the first
+        # terminal task. As a consequence, it is possible for the tree rooted
+        # at a cancelled task to have VIRTUAL tasks inside of it. These
+        # virtual tasks will never receive placements because they are not
+        # releasable. So, it is possible for the Spark application to stall on
+        # `GetPlacements` waiting on placements for these virtual tasks.
+
+        # Since the service knows the state of each task, it is easy then for
+        # the service to determine when the Spark application should terminate
+        # in the presence of task cancellations.
+
+        # So, instead of communicating task cancellations, we communicate when
+        # the Spark application should terminate.
+        #
+        # The first check makes sure all tasks are either CANCELLED,
+        # COMPLETED, or VIRTUAL. We check for all tasks because it is possible
+        # that the simulator is processing released and scheduled tasks. If we
+        # terminate early, then we will never receive `NotifyTaskCompletion`s
+        # for those tasks (because the Spark application was terminated),
+        # which then results in those tasks never getting removed from the
+        # worker pool.
+        #
+        # The second check makes sure that the task graph is indeed cancelled.
+        # We have this additional guard because at the start all tasks are
+        # VIRTUAL and we don't want to terminate the application then.
+        
+        if r.task_graph.is_cancelled():
+            self._logger.error(f"[{stime}] Task graph '{r.task_graph.name}' is in state cancelled.")
+
+        should_terminate = all(
+            task.state
+            in (
+                TaskState.CANCELLED,
+                TaskState.COMPLETED,
+                TaskState.VIRTUAL,
+            )
+            for task in r.task_graph
+        ) and (r.task_graph.is_cancelled())
+        if should_terminate:
+            msg = f"[{stime}] Task graph '{r.task_graph.name}' was cancelled and simulator has processed all released/ scheduled tasks. Terminating it since it has no more placements to provide."
+            self._logger.error(msg)
+            return erdos_scheduler_pb2.GetPlacementsResponse(
+                success=True,
+                message=msg,
+                terminate=True,
+            )
+        elif r.task_graph.is_cancelled() and not should_terminate:
+            msg = f"[{stime}] Task graph '{r.task_graph.name}' was cancelled but simulator is still processing some released/ scheduled tasks. Will provide placements."
+            self._logger.error(msg)
+        else:
+            msg = f"[{stime}] Task graph '{r.task_graph.name}' is actively running. Will provide placements."
+            self._logger.info(msg)
+
+        with self._lock:
+            sim_placements = self._simulator.get_current_placements_for_task_graph(
+                r.task_graph.name
             )
+
+        placements = []
+        for placement in sim_placements:
+            if placement.task.state != TaskState.RUNNING:
+                self._logger.debug(f"[{stime}] Skipping placement: {placement}")
+                continue
+
+            worker_id = (
+                self.__get_worker_id()
+                if placement.placement_type == Placement.PlacementType.PLACE_TASK
+                else "None"
+            )
+            task_id = r.spark_task_id(placement.task.name)
+            cores = (
+                sum(x for _, x in placement.execution_strategy.resources.resources)
+                if placement.placement_type == Placement.PlacementType.PLACE_TASK
+                else 0
+            )
+
+            if placement.placement_type not in (Placement.PlacementType.PLACE_TASK,):
+                raise NotImplementedError
+
+            placements.append(
+                {
+                    "worker_id": worker_id,
+                    "application_id": request.id,
+                    "task_id": task_id,
+                    "cores": cores,
+                },
+            )
+            
+        msg = f"[{stime}] Returning the following placements {placements} for task graph '{request.id}'."
+        self._logger.info(msg)
+        return erdos_scheduler_pb2.GetPlacementsResponse(
+            success=True,
+            message=msg,
+            placements=placements,
+        )
+
+    async def NotifyTaskCompletion(self, request, context):
+        stime = self.__stime()
+
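+        # Check if the task graph is registered. NOTE(review): the lookup uses request.application_id but the error message below formats request.id -- confirm this request type actually has an `id` field.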
+        if request.application_id not in self._registered_applications:
+            msg = f"[{stime}] Task graph with id '{request.id}' is not registered or does not exist"
+            self._logger.error(msg)
             return erdos_scheduler_pb2.NotifyTaskCompletionResponse(
                 success=False,
-                message=f"Application with ID {request.application_id} "
-                f"not registered yet.",
-            )
-
-        # Find the Task that has completed, and mark it as such.
-        matched_task = None
-        for task in task_graph.get_nodes():
-            if task.stage_id == request.task_id:
-                matched_task = task
-        if matched_task is None:
-            self._logger.warning(
-                "Trying to notify the backend scheduler that the task with ID %s "
-                "from application %s has completed, but the task "
-                "was not found in the TaskGraph.",
-                request.task_id,
-                request.application_id,
+                message=msg,
             )
+
+        r = self._registered_applications[request.application_id]
+        task = r.task_graph.get_task(r.canonical_task_id(request.task_id))
+        if task is None:
+            msg = f"[{stime}] Task '{request.task_id}' does not exist in the task graph '{r.task_graph.name}'"
+            self._logger.error(msg)
             return erdos_scheduler_pb2.NotifyTaskCompletionResponse(
                 success=False,
-                message=f"Task with ID {request.task_id} "
-                f"not found in TaskGraph {request.application_id}.",
+                message=msg,
             )
 
-        # Instead of completing & removing the task immediately, check
-        # if it is actually complete or will complete in the future
+        if task.state != TaskState.RUNNING:
+            msg = f"[{stime}] Received task completion notification for task '{request.task_id}' (mapped to '{r.canonical_task_id(request.task_id)}') of '{r.task_graph.name}' but it is not running"
+            self._logger.error(msg)
+            return erdos_scheduler_pb2.NotifyTaskCompletionResponse(
+                success=False,
+                message=msg,
+            )
 
-        # Get the actual task completion timestamp
+        # HACK: The worker pool doesn't step every tick (probably should), so task.remaining_time is not accurate. Instead, we compute actual_task_completion_time as the task's start time plus the runtime of its slowest execution strategy.
         actual_task_completion_time = (
-            matched_task.start_time.time + matched_task.remaining_time.time
-        )
-
-        current_time = time.time()
-        self._logger.info(
-            "Received task for completion at time: %s , task.start_time: %s ,"
-            "task.remaining_time (=runtime):  %s ,  actual completion time: %s ",
-            round(current_time),
-            matched_task.start_time.time,
-            matched_task.remaining_time.time,
-            actual_task_completion_time,
-        )
-
-        # TODO DG: remaining_time assumes execution of the slowest strategy
-        # Should be updated to reflect correct remaining_time based on chosen strategy?
-
-        # Add all tasks to _tasks_marked_for_completion queue.
-        # If task has actually completed, it will be dequeued immediately
-        # Else it will be dequeued at its actual task completion time
-        self._tasks_marked_for_completion.put(
-            TimedItem(actual_task_completion_time, matched_task)
+            task.start_time + task.slowest_execution_strategy.runtime
         )
 
-        # NOTE: task.finish() and run_scheduler() invocations are postponed
-        # until it is time for the task to be actually marked as complete.
-
-        return erdos_scheduler_pb2.NotifyTaskCompletionResponse(
-            success=True,
-            message=f"Task with ID {request.task_id} marked for completion at "
-            f"{round(current_time)}! It will be removed on actual "
-            f"task completion time at {actual_task_completion_time}",
-        )
-
-    async def GetPlacements(self, request, context):
-        """Retrieves the placements applicable at the specified time."""
-        request_timestamp = EventTime(request.timestamp, EventTime.Unit.S)
-        if not self._initialized:
-            self._logger.warning(
-                "Trying to get placements for %s at time %s, "
-                "but no framework is registered yet.",
-                request.id,
-                request_timestamp,
+        with self._lock:
+            # NOTE: actual_task_completion_time is correct for task completion notifications that arrive early, but it is
+            # stale for notifications that arrive after that time. Thus, the max of the current time and the computed completion time
+            # is used so that the TASK_FINISHED event is never scheduled in the past.
+            task_finished_event = Event(
+                event_type=EventType.TASK_FINISHED,
+                time=max(actual_task_completion_time, self.__stime()),
+                task=task,
             )
-            return erdos_scheduler_pb2.GetPlacementsResponse(
-                success=False, message="Framework not registered yet."
+            self._simulator._event_queue.add_event(task_finished_event)
+            self._logger.info(
+                f"[{stime}] Adding event {task_finished_event} to the simulator's event queue"
             )
+            if actual_task_completion_time < self.__stime():
+                self._logger.error(
+                    f"[{stime}] Task '{request.task_id}' of task graph '{r.task_graph.name}' had exceeded its runtime by {self.__stime() - actual_task_completion_time}")
 
-        if request.id not in self._placements:
-            self._logger.warning(
-                "Trying to get placements for %s at time %s, but the application "
-                "was not registered with the backend yet.",
-                request.id,
-                request_timestamp,
+            scheduler_start_event = Event(
+                event_type=EventType.SCHEDULER_START,
+                time=max(
+                    actual_task_completion_time.to(EventTime.Unit.US),
+                    self.__stime(),
+                ),
+            )
+            self._simulator._event_queue.add_event(scheduler_start_event)
+            self._logger.info(
+                f"[{stime}] Added event {scheduler_start_event} to the simulator's event queue"
             )
 
-        # Construct and return the placements.,
-        placements = []
-        clip_at = -1
-        for index, placement in enumerate(self._placements[request.id]):
-            if placement.placement_time <= request_timestamp:
-                clip_at = index
-                # Mark the Task as RUNNING.
-                placement.task.start(request_timestamp)
-
-                # resources = placement.execution_strategy.resources
-                placements.append(
-                    erdos_scheduler_pb2.Placement(
-                        worker_id=placement.worker_id,
-                        application_id=request.id,
-                        task_id=placement.task.stage_id,
-                        cores=1,
-                    )
-                )
-        self._placements[request.id] = self._placements[request.id][clip_at + 1 :]
-        self._logger.info(
-            "Constructed %s placements at time %s for application with ID %s.",
-            len(placements),
-            request.timestamp,
-            request.id,
-        )
-        return erdos_scheduler_pb2.GetPlacementsResponse(
+        msg = f"[{stime}] Successfully processed completion of task '{request.task_id}' of task graph '{r.task_graph.name}'"
+        self._logger.info(msg)
+        return erdos_scheduler_pb2.NotifyTaskCompletionResponse(
             success=True,
-            placements=placements,
-            message=f"Constructed {len(placements)} "
-            f"placements at time {request.timestamp}.",
+            message=msg,
         )
 
-    # Function to pop tasks from queue based on actual completion time
-    async def PopTasksBasedOnTime(self):
-        while True:
-            if not self._tasks_marked_for_completion.empty():
-                # Get the top item from the priority queue
-                top_item = self._tasks_marked_for_completion._queue[0][1]
-
-                # Check if top item's timestamp is reached or passed by current time
-                current_time = time.time()
-                if top_item.timestamp <= current_time:
-                    # Pop the top item
-                    popped_item = self._tasks_marked_for_completion.get()
-                    self._logger.info(
-                        "Removing tasks from pending completion queue: %s at time: %s",
-                        popped_item.task,
-                        current_time,
-                    )
+    async def Shutdown(self, request, context):
+        self._received_shutdown = True
+        return erdos_scheduler_pb2.Empty()
+
+    async def _tick_simulator(self):
+        while not self._shutting_down:
+            with self._lock:
+                if self._simulator is not None:
+                    stime = self.__stime()
+                    # self._logger.debug(f"[{stime}] Simulator tick")
+                    self._simulator.tick(until=stime)
+            # else:
+            #     print("Simulator instance is None")
+            await asyncio.sleep(1)
+
+    def __stime(self) -> EventTime:
+        """
+        Time as viewed by the service. Starts when a framework is registered
+        and ends when it is deregistered.
+        """
+        if self._initialization_time is None:
+            return EventTime.invalid()
+        ts = int(time.time())
+        # NOTE: time.time() yields seconds, but the value is tagged as US (not converted) for better compatibility with the simulator.
+        # The simulator uses an abstract unit of time, so only relative differences matter.
+        ts = EventTime(ts, EventTime.Unit.US)
+        return ts - self._initialization_time
+
+    def __framework_registered(self):
+        return self._simulator is not None
+
+    def __worker_registered(self):
+        return (
+            self.__framework_registered() and len(self.__get_worker_pool().workers) > 0
+        )
 
-                    # Mark the Task as completed.
-                    # Also release the task from the scheduler service
-                    popped_item.task.update_remaining_time(EventTime.zero())
-                    popped_item.task.finish(
-                        EventTime(round(current_time), EventTime.Unit.S)
-                    )
+    def __get_worker_pool(self):
+        # Simulator maintains only one worker pool, so this should be fine
+        return next(iter(self._simulator._worker_pools.worker_pools))
 
-                    # Run the scheduler since the Workload has changed.
-                    await self.run_scheduler()
+    def __get_worker(self):
+        return self.__get_worker_pool().workers[0]
 
-                else:
-                    # If the top item's timestamp hasn't been reached yet,
-                    # sleep for a short duration
-                    await asyncio.sleep(0.1)  # TODO: Can adjust value, curr=0.1s
-            else:
-                # If the queue is empty, sleep for a short duration
-                await asyncio.sleep(0.1)  # TODO: Can adjust value, curr=0.1s
+    def __get_worker_id(self):
+        # We return the name here because we register the worker id from
+        # Spark as the name of the worker in the worker pool
+        return self.__get_worker().name
 
+    def __can_accomodate_task_graph(self, job_graph: JobGraph):
+        worker_resources = self.__get_worker().resources
+        for job in job_graph:
+            for strat in job.execution_strategies:
+                for resource, quantity in strat.resources.resources:
+                    if worker_resources.get_total_quantity(resource) < quantity:
+                        return False
+        return True
 
-async def serve():
-    """Serves the ERDOS Scheduling RPC Server."""
-    # Initialize the server.
-    server = grpc.aio.server(futures.ThreadPoolExecutor(max_workers=FLAGS.max_workers))
-    erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server(
-        SchedulerServiceServicer(), server
-    )
 
-    # Start the server.
-    server.add_insecure_port(f"[::]:{FLAGS.port}")
+async def serve(server):
     await server.start()
-    print("Initialized ERDOS Scheduling RPC Server on port", FLAGS.port)
+    print("Initialized ERDOS RPC Service on port", FLAGS.port)
     await server.wait_for_termination()
 
 
-def main(argv):
-    # Create an asyncio event loop
+def main(_argv):
     loop = asyncio.get_event_loop()
 
-    # Run the event loop until serve() completes
+    server = grpc.aio.server(futures.ThreadPoolExecutor(max_workers=FLAGS.max_workers))
+    servicer = Servicer(server)
+    erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server(servicer, server)
+    server.add_insecure_port(f"[::]:{FLAGS.port}")
+
+    # Schedule the periodic tick_simulator task
+    loop.create_task(servicer._tick_simulator())
+
     try:
-        loop.run_until_complete(serve())
+        loop.run_until_complete(serve(server))
+    except KeyboardInterrupt:
+        print("Terminated ERDOS RPC Service")
     finally:
         loop.close()
 
diff --git a/rpc/service_old.py b/rpc/service_old.py
new file mode 100644
index 00000000..6629ebc7
--- /dev/null
+++ b/rpc/service_old.py
@@ -0,0 +1,1847 @@
+import asyncio
+import heapq
+import math
+import os
+import random
+import sys
+import time
+from concurrent import futures
+from operator import attrgetter
+from typing import Dict, Mapping, Sequence
+from urllib.parse import urlparse
+
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
+)
+
+import erdos_scheduler_pb2
+import erdos_scheduler_pb2_grpc
+import grpc
+from absl import app, flags
+from tpch_utils import get_all_stage_info_for_query, verify_and_relable_tpch_app_graph
+
+from schedulers import EDFScheduler, FIFOScheduler, TetriSchedScheduler
+from utils import EventTime, setup_logging
+from workers import Worker, WorkerPool, WorkerPools
+from workload import (
+    ExecutionStrategies,
+    ExecutionStrategy,
+    Job,
+    Placement,
+    Resource,
+    Resources,
+    Task,
+    TaskGraph,
+    TaskState,
+    Workload,
+    WorkProfile,
+)
+
+FLAGS = flags.FLAGS
+
+# ---- RPC server and logging flags ----
+flags.DEFINE_integer("port", 50051, "Port to serve the ERDOS Scheduling RPC Server on.")
+flags.DEFINE_integer(
+    "max_workers", 10, "Maximum number of workers to use for the RPC server."
+)
+flags.DEFINE_string("log_file_name", None, "Name of the log file.", short_name="log")
+flags.DEFINE_string("log_level", "debug", "The level to log.")
+
+# ---- Spark / framework flags ----
+flags.DEFINE_integer(
+    "initial_executors",
+    10,
+    "The initial number of executors that are requested by each application.",
+)
+flags.DEFINE_float(
+    "spark_task_duration_multiplier",
+    1,
+    "The multiplier used for spark job task runtimes. Buffer time is added "
+    "to ensure that tasks complete before the scheduler expects it to complete. "
+    "Completion of tasks after the scheduler's expected task completion time "
+    "is detrimental for scheduler's planning and could invalidate some schedules",
+)
+flags.DEFINE_integer(
+    "virtualized_cores",
+    500,
+    "The number of virtualized cores that must be created in each Worker on the "
+    "framework. This allows us to spawn a higher number of executors than the number "
+    "possible with actual available resources. Thus, we can spawn the executors for "
+    "each application, and only selectively activate them according to the actual "
+    "available resources.",
+)
+flags.DEFINE_integer(
+    "virtualized_memory",
+    500,
+    "The amount of virtualized memory (in GB) that must be created in each Worker on "
+    "the framework. Refer to the `virtualized_cores` flag for more information.",
+)
+flags.DEFINE_enum(
+    "scheduler", "DAGSched", ["FIFO", "EDF", "DAGSched"], "The scheduler to use for "
+    "this execution."
+)
+
+# ---- TPC-H workload flags ----
+flags.DEFINE_enum(
+    "tpch_profile_type", "Cloudlab", ["Decima", "Cloudlab"], "The set of profiles to "
+    "use for execution of tpch queries. Note that Cloudlab profile has all 22 queries. "
+    "From the Decima profile we support only 15 queries (1-10, 12-14, 16, 19). The "
+    "rest might also run but DAG structure might not match Decima profiles."
+)
+flags.DEFINE_enum(
+    "tpch_dataset_size", "50", ["50", "100", "250", "500"], "Options in GB eg. 50g for "
+    "dataset size of TPCH query. The Cloudlab profile will be picked accordingly. "
+)
+flags.DEFINE_enum(
+    "tpch_max_executors_per_job", "50", ["50", "75", "100", "200"], "Options for "
+    "max executors to use for tpch queries. The Cloudlab profile will be picked "
+    "accordingly."
+)
+flags.DEFINE_bool(
+    "override_worker_cpu_count",
+    False,
+    "If True, worker CPU count will be set to INT_MAX. This allows us to scale up "
+    "spark experiments without actually deploying a large spark cluster.",
+)
+flags.DEFINE_bool(
+    "use_profile_to_scale_executors",
+    False,
+    "If True, it means that a fixed number of (max) executors was given to the "
+    "spark job to run. With this profile, we can directly use the profiled "
+    "stage runtime, while setting the number of required slots or executors "
+    "to 1 per stage. This allows us do the same scheduling but creates less "
+    "overhead for this rpc service while running the experiments.",
+)
+
+# ---- Scheduler behaviour flags ----
+flags.DEFINE_bool(
+    "release_taskgraphs",
+    False,
+    "If True, all tasks from a graph are released if any of the tasks have "
+    "reached their release time.",
+)
+flags.DEFINE_bool(
+    "enforce_deadlines",
+    False,
+    "True if the ILP formulation must ensure that deadlines are met.",
+)
+flags.DEFINE_integer(
+    "scheduler_time_discretization",
+    1,
+    "The length of each slot in the space-time matrix to consider for scheduling the "
+    "tasks (in µs). The default value is 1µs, and a higher value can lead to faster "
+    "solutions but a potentially lower goodput due to resources being blocked for the "
+    "entirety of the slot.",
+)
+flags.DEFINE_bool(
+    "scheduler_enable_optimization_pass",
+    False,
+    # The trailing space keeps the two concatenated fragments from fusing.
+    "If `True`, the scheduler runs pre/post-translation optimization passes "
+    "when registering STRL expression.",
+)
+flags.DEFINE_float(
+    "scheduler_reconsideration_period",
+    0.1,
+    "The percentage of critical path duration until which the scheduler will try "
+    "placing the TaskGraph, and drop the TaskGraph if it cannot be placed after.",
+)
+flags.DEFINE_bool(
+    "retract_schedules", False, "Enable the retraction of previously decided schedules."
+)
+flags.DEFINE_integer(
+    "scheduler_time_limit",
+    3,
+    "The time limit (in seconds) to allow the scheduler to keep "
+    "searching for solutions without finding a better one.",
+)
+flags.DEFINE_bool(
+    "scheduler_dynamic_discretization",
+    False,
+    "If `True`, the scheduler creates space-time matrix non-uniformly. "
+    "The discretization is dynamically decided based on the occupancy request for "
+    "each time slice. (default: False)",
+)
+flags.DEFINE_integer(
+    "scheduler_max_time_discretization",
+    8,
+    # Refers to the flag defined above; there is no "adaptive" discretization flag.
+    "The maximum discretization that the scheduler can have (in µs). "
+    "Only used when scheduler_dynamic_discretization flag is enabled. (default: 8)",
+)
+flags.DEFINE_float(
+    "scheduler_max_occupancy_threshold",
+    0.8,
+    "The percentage b/w 0 and 1 of maximum occupancy beyond which the discretization "
+    "would always be 1 incase of dynamic discretization. "
+    "This flag is only used when dynamic discretization is enabled (default: 0.8)",
+)
+flags.DEFINE_bool(
+    "finer_discretization_at_prev_solution",
+    False,
+    "If `True`, the scheduler keeps discretization of 1 around previous solution. "
+    "The discretization is dynamically decided based on the occupancy request for "
+    "each time slice. (default: False)",
+)
+flags.DEFINE_integer(
+    "finer_discretization_window",
+    5,
+    "The window around previous solution that keeps discretization of 1.",
+)
+flags.DEFINE_bool(
+    "scheduler_selective_rescheduling",
+    False,
+    "If `True`, the supported schedulers will follow some pre-defined strategies for "
+    "selectively sampling TaskGraphs to reschedule.",
+)
+flags.DEFINE_integer(
+    "scheduler_plan_ahead_no_consideration_gap",
+    4,
+    "The length of time gap (in µs) for which the reconsiderations are frozen. "
+    "From the current time to the consideration gap, any tasks placed will not be "
+    "reconsidered for rescheduling.",
+)
+flags.DEFINE_list(
+    "scheduler_log_times",
+    [],
+    # The trailing space keeps the two concatenated sentences from fusing.
+    "A list of timestamps (in µs) at which to request extra logging from the Scheduler. "
+    "If scheduler_log_to_file is `True`, then extra information will be requested for "
+    "all timestamps.",
+)
+flags.DEFINE_integer(
+    "scheduler_selective_rescheduling_sample_size",
+    5,
+    "If `scheduler_selective_rescheduling` is True, then this flag defines the number "
+    "of TaskGraphs to sample for rescheduling.",
+)
+
+# ---- Deadline and sampling flags ----
+flags.DEFINE_integer(
+    "min_task_graph_deadline_variance",
+    10,
+    "The MIN percentage (additive) factor to be used with critical path length of the task graph. "
+    "This helps inform the deadline for the taskgraph and all tasks within the task "
+    "graph. The value be > 0 since the taskgraph would take atleast the critical path "
+    "time duration to complete.",
+)
+flags.DEFINE_integer(
+    "max_task_graph_deadline_variance",
+    25,
+    "The MAX percentage (additive) factor to be used with critical path length of the task graph. "
+    "This helps inform the deadline for the taskgraph and all tasks within the task "
+    "graph. The value be > min_task_graph_deadline_variance since deadline is decided based on it.",
+)
+flags.DEFINE_bool(
+    "uniformly_sample_task_slots",
+    False,
+    "Enabling this ignores the TPCH profiled taskslots and uses a seeded, rng generated "
+    "num_tasks (= num_slots) for different stages of the TPCH job, uniformly sampled "
+    "in a range.",
+)
+flags.DEFINE_integer(
+    "random_seed",
+    # Evaluated once at import time, so each process gets its own default seed.
+    random.randint(0, sys.maxsize),
+    "The seed to be used for random number generation. Defaults to a random number.",
+)
+
+# Define an item containing completion timestamp and task
+class TimedItem:
+    """A task paired with a timestamp, ordered by timestamp and then by id.
+
+    Each instance takes the next value of a class-wide counter as its `id`,
+    which breaks ties between items that share a timestamp. The counter wraps
+    back to zero after `_id_threshold`, so ids are only unique within a window
+    of `_id_threshold + 1` consecutively created items.
+    """
+
+    _next_id = 0
+    _id_threshold = 99999
+
+    def __init__(self, timestamp, task):
+        """Store `timestamp` and `task`, and claim the next tie-breaker id."""
+        self.timestamp = timestamp
+        self.task = task
+        self.id = TimedItem._next_id
+        TimedItem._next_id += 1
+
+        # Reset _next_id if it crosses the threshold
+        # We keep _next_id bounded to avoid very large numbers
+        # which could lead to slightly slower comparisons
+        if TimedItem._next_id > TimedItem._id_threshold:
+            TimedItem._next_id = 0
+
+    def __lt__(self, other):
+        """Less than comparison for TimedItem instances."""
+        if not isinstance(other, TimedItem):
+            # Let Python try the reflected operation / raise TypeError itself.
+            return NotImplemented
+        if self.timestamp == other.timestamp:
+            # Unique ID for each TimedItem acts as tie-breaker
+            # for inserting into PriorityQueue
+            return self.id < other.id
+        return self.timestamp < other.timestamp
+
+    def __eq__(self, other):
+        """Equality comparison for TimedItem instances."""
+        if not isinstance(other, TimedItem):
+            # Comparing against a foreign object is "not equal", not an error.
+            return NotImplemented
+        return self.timestamp == other.timestamp and self.id == other.id
+
+    def __hash__(self):
+        """Hash on the id only: equal items always share an id.
+
+        Defining `__eq__` alone would make instances unhashable; hashing the id
+        avoids requiring the timestamp type to be hashable.
+        """
+        return hash(self.id)
+
+
+# Define a priority queue based on heapq module
+class PriorityQueue:
+    """A min-heap of items, keyed on each item's `timestamp` attribute."""
+
+    def __init__(self):
+        # Heap of `(timestamp, item)` pairs maintained through `heapq`.
+        self._queue = []
+
+    def put(self, item):
+        """Insert `item`, keyed by `item.timestamp`."""
+        entry = (item.timestamp, item)
+        heapq.heappush(self._queue, entry)
+
+    def get(self):
+        """Remove and return the item with the smallest key."""
+        return heapq.heappop(self._queue)[1]
+
+    def empty(self):
+        """Return `True` when no items are queued."""
+        return not self._queue
+
+
+# Implement the service.
+class SchedulerServiceServicer(erdos_scheduler_pb2_grpc.SchedulerServiceServicer):
+    def __init__(self) -> None:
+        """Set up logging, bookkeeping state and the scheduler chosen by FLAGS.
+
+        Besides initializing attributes, this spawns `PopTasksBasedOnTime` as an
+        asyncio task before delegating to the parent initializer.
+
+        Raises:
+            ValueError: If `FLAGS.scheduler` names an unknown scheduler.
+        """
+        # Logger and framework-registration state.
+        self._logger = setup_logging(
+            name=__name__,
+            log_dir=FLAGS.log_dir,
+            log_file=FLAGS.log_file_name,
+            log_level=FLAGS.log_level,
+        )
+        self._initialized = False
+        self._initialization_time = -1
+        self._last_step_up_time = EventTime.zero()
+        self._master_uri = None
+
+        # Simulator objects; the pools and workload are created when a framework
+        # registers.
+        self._worker_pool = None
+        self._worker_pools = None
+        self._drivers: Mapping[str, Task] = {}
+        self._workload = None
+
+        # Counters describing how the registered taskgraphs have fared.
+        self._total_taskgraphs_registered = 0
+        self._total_taskgraphs_missed = 0
+        self._total_taskgraphs_met = 0
+        self._total_taskgraphs_cancelled = 0
+        self._cancelled_taskgraphs = set()
+        self._min_task_graph_deadline_variance = FLAGS.min_task_graph_deadline_variance
+        self._max_task_graph_deadline_variance = FLAGS.max_task_graph_deadline_variance
+
+        # Seeded random number generator kept for later use.
+        self._rng = random.Random(FLAGS.random_seed)
+
+        # Scheduler run-state guarded by the lock below.
+        self._scheduler_running_lock = asyncio.Lock()
+        self._scheduler_running = False
+        self._rerun_scheduler = False
+        self._scheduler_is_task_type = False
+
+        scheduler_name = FLAGS.scheduler
+        if scheduler_name in ("EDF", "FIFO"):
+            # NOTE: FIFO is supposed to be run as deadline unaware
+            scheduler_cls = EDFScheduler if scheduler_name == "EDF" else FIFOScheduler
+            self._scheduler = scheduler_cls(
+                enforce_deadlines=FLAGS.enforce_deadlines,
+                _flags=FLAGS,
+            )
+            self._scheduler_is_task_type = True
+        elif scheduler_name == "DAGSched":
+            # How the flags reach TetriSched:
+            #   * scheduler_time_discretization, scheduler_max_time_discretization,
+            #     finer_discretization_window and
+            #     scheduler_plan_ahead_no_consideration_gap are converted to
+            #     EventTime and passed under different argument names.
+            #   * scheduler_dynamic_discretization and
+            #     scheduler_max_occupancy_threshold are passed under different
+            #     argument names.
+            #   * scheduler_enable_optimization_pass,
+            #     scheduler_selective_rescheduling and
+            #     scheduler_reconsideration_period are passed through _flags.
+            microseconds = EventTime.Unit.US
+            self._scheduler = TetriSchedScheduler(
+                release_taskgraphs=FLAGS.release_taskgraphs,
+                time_discretization=EventTime(
+                    FLAGS.scheduler_time_discretization, microseconds
+                ),
+                _flags=FLAGS,
+                max_time_discretization=EventTime(
+                    FLAGS.scheduler_max_time_discretization, microseconds
+                ),
+                enforce_deadlines=FLAGS.enforce_deadlines,
+                dynamic_discretization=FLAGS.scheduler_dynamic_discretization,
+                max_occupancy_threshold=FLAGS.scheduler_max_occupancy_threshold,
+                retract_schedules=FLAGS.retract_schedules,
+                finer_discretization_at_prev_solution=(
+                    FLAGS.finer_discretization_at_prev_solution
+                ),
+                finer_discretization_window=EventTime(
+                    FLAGS.finer_discretization_window, microseconds
+                ),
+                plan_ahead_no_consideration_gap=EventTime(
+                    FLAGS.scheduler_plan_ahead_no_consideration_gap, microseconds
+                ),
+                log_to_file=True,
+            )
+            self._scheduler_is_task_type = not FLAGS.release_taskgraphs
+        else:
+            raise ValueError(f"Unknown scheduler {FLAGS.scheduler}.")
+
+        # Placements, as a nested mapping: the outer key is the application
+        # (taskgraph) id, the inner key is the task, and the value is the
+        # Placement retrieved from the scheduler.
+        # TODO: (DG) This will no longer be ordered by time, so the check needs to be
+        # done for all tasks? Also, we might need to delete the placement once executed?
+        self._placements: Dict[str, Dict[str, Placement]] = {}
+
+        # Placements are deleted after being released, so completed ones are
+        # tracked here. Can be used for debugging.
+        self._executed_placements: Dict[str, Placement] = {}
+
+        # Tasks waiting to be marked as complete.
+        self._tasks_marked_for_completion = PriorityQueue()
+
+        # Start the asyncio loop for clearing out pending tasks for completion
+        asyncio.create_task(self.PopTasksBasedOnTime())
+
+        super().__init__()
+
+    async def schedule(self) -> None:
+        """Schedules the tasks that have been added to the Workload.
+
+        Only one cycle runs at a time: if the scheduler is already marked as
+        running, an error is logged and the call returns. Otherwise the cycle
+        is executed and, once it ends (normally or by raising), the running
+        flag is cleared and a queued rerun request is honoured by spawning a
+        new `schedule()` task.
+        """
+        current_time = (
+            EventTime(int(time.time()), EventTime.Unit.US) - self._initialization_time
+        )
+
+        async with self._scheduler_running_lock:
+            if self._scheduler_running:
+                self._logger.error(
+                    "[%s] Scheduler already running, this should never be reached.",
+                    current_time,
+                )
+                return
+            self._scheduler_running = True
+
+        scheduler_end_time = None
+        try:
+            self._run_scheduling_cycle(current_time)
+            scheduler_end_time = (
+                EventTime(int(time.time()), EventTime.Unit.US)
+                - self._initialization_time
+            )
+            self._logger.info(
+                "[%s] Finished the scheduling cycle initiated at %s.",
+                scheduler_end_time,
+                current_time,
+            )
+        finally:
+            # Always clear the running flag, even when the cycle raised; leaving
+            # it set would make every later request queue up behind a cycle that
+            # no longer exists.
+            if scheduler_end_time is None:
+                scheduler_end_time = (
+                    EventTime(int(time.time()), EventTime.Unit.US)
+                    - self._initialization_time
+                )
+            # Check if another run of the Scheduler has been requested, and if so,
+            # create a task for it. Otherwise, mark the scheduler as not running.
+            async with self._scheduler_running_lock:
+                self._scheduler_running = False
+                self._logger.info(
+                    "[%s] self._rerun_scheduler: %s.",
+                    scheduler_end_time,
+                    self._rerun_scheduler,
+                )
+                if self._rerun_scheduler:
+                    self._rerun_scheduler = False
+                    asyncio.create_task(self.schedule())
+
+    def _run_scheduling_cycle(self, current_time) -> None:
+        """Log utilization, step the worker pool, and apply scheduler decisions."""
+        self._logger.info(
+            "[%s] Starting a scheduling cycle with %s TaskGraphs and %s Workers.",
+            current_time,
+            len(self._workload.task_graphs),
+            len(self._worker_pool.workers),
+        )
+
+        # Cumulate the resources from all the WorkerPools
+        for worker_pool in self._worker_pools.worker_pools:
+            worker_pool_resources = worker_pool.resources
+            for resource_name in {
+                value[0].name for value in worker_pool_resources.resources
+            }:
+                resource = Resource(name=resource_name, _id="any")
+                self._logger.info(
+                    f"{current_time},WORKER_POOL_UTILIZATION,{worker_pool.id},"
+                    f"{resource_name},"
+                    f"{worker_pool_resources.get_allocated_quantity(resource)},"
+                    f"{worker_pool_resources.get_available_quantity(resource)}"
+                )
+
+        # Perform worker pool step
+        self._logger.info(
+            "[%s] Need to perform a step before schedule().",
+            current_time,
+        )
+        completed_tasks = self.PerformWorkerPoolStep(sim_time=current_time)
+
+        # Finish all tasks that have now completed
+        for completed_task in completed_tasks:
+            self.CleanupTaskExecution(task=completed_task, sim_time=current_time)
+
+        # TODO (Sukrit): Change this to a better implementation.
+        # Let's do some simple scheduling for now, that gives a fixed number of
+        # executors to all the available applications in intervals of 10 seconds.
+        if len(self._workload.task_graphs) < 1:
+            return
+
+        scheduler_placements = self._scheduler.schedule(
+            sim_time=EventTime(current_time.time, EventTime.Unit.US),
+            workload=self._workload,
+            worker_pools=self._worker_pools,
+        )
+        self._apply_cancel_placements(scheduler_placements, current_time)
+        self._apply_task_placements(scheduler_placements, current_time)
+        self._invalidate_unplaced_placements(scheduler_placements, current_time)
+
+    def _apply_cancel_placements(self, scheduler_placements, current_time) -> None:
+        """Record CANCEL_TASK placements and cancel the affected tasks and graphs."""
+        # Filter the scheduler_placements that are now in CANCEL_TASK state.
+        cancel_task_placements = [
+            p
+            for p in scheduler_placements
+            if p.placement_type == Placement.PlacementType.CANCEL_TASK
+        ]
+        self._logger.info(
+            "[%s] Received %s tasks to be cancelled: %s.",
+            current_time,
+            len(cancel_task_placements),
+            cancel_task_placements,
+        )
+        # Issue task cancellations for identified tasks and taskgraphs so that
+        # the taskgraphs are no longer in consideration
+        for placement in cancel_task_placements:
+            # Update the task placement decision so that we can stop
+            # responding to RPC calls from its driver based on CANCEL_TASK type
+            if placement.task.task_graph not in self._placements:
+                self._placements[placement.task.task_graph] = {}
+                self._logger.warning(
+                    "[%s] Came to cancel a placement but taskgraph %s was not in "
+                    "self._placements. Creating an empty dict entry.",
+                    current_time,
+                    placement.task.task_graph,
+                )
+            self._placements[placement.task.task_graph][placement.task] = placement
+            self._logger.info(
+                "[%s] Added cancel placement to taskgraph %s for task %s. "
+                "Placement: %s",
+                current_time,
+                placement.task.task_graph,
+                placement.task,
+                placement,
+            )
+
+            # Since even one task getting cancelled, implies task-graph
+            # cancellation, we add the task-graph to cancelled set
+            if placement.task.task_graph not in self._cancelled_taskgraphs:
+                self._cancelled_taskgraphs.add(placement.task.task_graph)
+                self._total_taskgraphs_cancelled += 1
+                self._logger.info(
+                    "[%s] RUN_STATS (registered, met, missed, cancelled): %s, %s, %s, %s",
+                    current_time,
+                    self._total_taskgraphs_registered,
+                    self._total_taskgraphs_met,
+                    self._total_taskgraphs_missed,
+                    self._total_taskgraphs_cancelled,
+                )
+
+            self._logger.info(
+                "[%s] Cancelling task: %s from taskgraph: %s",
+                current_time,
+                placement.task.name,
+                placement.task.task_graph,
+            )
+            # Sending tasks to cancel.
+            placement.task.cancel(
+                time=current_time,
+            )
+
+            # Also cancel the task-graph so that all dependent tasks are removed
+            task_graph = self._workload.get_task_graph(placement.task.task_graph)
+            if task_graph is None:
+                self._logger.error(
+                    "[%s] No TaskGraph found for %s",
+                    current_time,
+                    placement.task.task_graph,
+                )
+                # Without a TaskGraph there are no dependents to cancel; move on
+                # instead of dereferencing None.
+                continue
+
+            for cancelled_task in task_graph.cancel(placement.task, current_time):
+                self._logger.info(
+                    "[%s] Further cancelling dependent task: %s from taskgraph: %s",
+                    current_time,
+                    cancelled_task.name,
+                    placement.task.task_graph,
+                )
+                cancelled_task.cancel(
+                    time=current_time,
+                )
+
+            # TODO: (DG): Ensure that task-graph is removed from the workload and
+            # doesn't show up in the next iteration of tetrisched scheduler?
+
+    def _apply_task_placements(self, scheduler_placements, current_time) -> None:
+        """Record placed PLACE_TASK placements and mark their tasks as scheduled."""
+        # Keep only the placements that are of type PLACE_TASK and have actually
+        # been placed.
+        placed_placements = (
+            p
+            for p in scheduler_placements
+            if p.placement_type == Placement.PlacementType.PLACE_TASK
+            and p.is_placed()
+        )
+        for placement in sorted(placed_placements, key=attrgetter("placement_time")):
+            if placement.task.task_graph not in self._placements:
+                self._placements[placement.task.task_graph] = {}
+                self._logger.info(
+                    "[%s] Want to add a placement but taskgraph %s was not in "
+                    "self._placements. Creating an empty dict entry.",
+                    current_time,
+                    placement.task.task_graph,
+                )
+            if placement.task not in self._placements[placement.task.task_graph]:
+                self._logger.info(
+                    "[%s] Adding new placement to taskgraph %s for task %s. "
+                    "Placement: %s",
+                    current_time,
+                    placement.task.task_graph,
+                    placement.task,
+                    placement,
+                )
+            else:
+                self._logger.info(
+                    "[%s] Updating an existing placement in taskgraph %s for task %s. "
+                    "Placement: %s",
+                    current_time,
+                    placement.task.task_graph,
+                    placement.task,
+                    placement,
+                )
+            self._placements[placement.task.task_graph][placement.task] = placement
+
+            # Schedule the task here since marking it as running requires it to be
+            # scheduled before. We mark it to be running when we inform the
+            # framework of the placement.
+
+            # TODO: (DG) ASK - dont think tasks need to be marked as unscheduled on cancellation?
+            placement.task.schedule(
+                time=placement.placement_time,
+                placement=placement,
+            )
+
+    def _invalidate_unplaced_placements(
+        self, scheduler_placements, current_time
+    ) -> None:
+        """Drop the recorded placements of taskgraphs that now have unplaced tasks."""
+        # Handle task placements that have returned with unplaced tasks
+        unplaced_placements = (
+            p
+            for p in scheduler_placements
+            if p.placement_type == Placement.PlacementType.PLACE_TASK
+            and not p.is_placed()
+        )
+        for placement in unplaced_placements:
+            if placement.task.task_graph not in self._placements:
+                self._logger.info(
+                    "[%s] Taskgraph %s not found for task %s, couldn't invalidate "
+                    "it or it was previously invalidated.",
+                    current_time,
+                    placement.task.task_graph,
+                    placement.task,
+                )
+            elif placement.task in self._placements[placement.task.task_graph]:
+                self._logger.info(
+                    "[%s] Invalidated the placement (taskgraph %s and task %s)"
+                    "from self._placements along with entire taskgraph.",
+                    current_time,
+                    placement.task.task_graph,
+                    placement.task,
+                )
+                for task in self._placements[placement.task.task_graph]:
+                    self._logger.info(
+                        "[%s] Invalidating the placement for task %s "
+                        "from self._placements due to invalidation of %s.",
+                        current_time,
+                        task,
+                        placement.task,
+                    )
+                    # Unschedule the task
+                    if task.state is TaskState.SCHEDULED:
+                        task.unschedule(time=current_time)
+                    else:
+                        self._logger.warning(
+                            "[%s] Could not unschedule since task %s was "
+                            "found in state %s in during invalidation of %s.",
+                            current_time,
+                            task,
+                            task.state,
+                            placement.task,
+                        )
+                # delete the taskgraph at once since we cant change size
+                # of dict while iterating
+                del self._placements[placement.task.task_graph]
+            else:
+                self._logger.info(
+                    "[%s] Couldn't invalidate placement (taskgraph %s and task %s)."
+                    "It couldnt be found in self._placements.",
+                    current_time,
+                    placement.task.task_graph,
+                    placement.task,
+                )
+
+    async def run_scheduler(self) -> None:
+        """Start a scheduling cycle, or queue one if a cycle is already running.
+
+        While the scheduler is marked as running, a request only sets the rerun
+        flag, so any number of requests arriving during one cycle are batched
+        into a single follow-up run."""
+        async with self._scheduler_running_lock:
+            if self._scheduler_running:
+                self._rerun_scheduler = True
+                return
+            asyncio.create_task(self.schedule())
+
+    async def RegisterFramework(self, request, context):
+        """Registers a new framework with the backend scheduler.
+        This is the entry point for a new instance of Spark / Flink to register
+        itself with the backend scheduler, and is intended as an EHLO.
+
+        Args:
+            request: The registration request; its `name`, `uri` and `timestamp`
+                fields are read.
+            context: The gRPC call context (unused).
+
+        Returns:
+            A `RegisterFrameworkResponse` whose `success` is `False` when a
+            framework has already been registered, and `True` once the worker
+            pool and workload for the new framework have been created.
+        """
+        if self._initialized:
+            self._logger.warning(
+                "Framework already registered at %s with the address %s at %s",
+                self._initialization_time,
+                self._master_uri,
+                self._initialization_time,
+            )
+            return erdos_scheduler_pb2.RegisterFrameworkResponse(
+                success=False,
+                message=f"Framework already registered at "
+                f"{self._initialization_time} at the address {self._master_uri}",
+            )
+
+        # Setup a new Framework instance.
+        framework_name = request.name
+        self._master_uri = request.uri
+        self._initialization_time = EventTime(request.timestamp, EventTime.Unit.US)
+        self._initialized = True
+        self._logger.info(
+            "[%s] Registering framework %s with URI %s.",
+            self._initialization_time,
+            framework_name,
+            self._master_uri,
+        )
+
+        # Setup the simulator types. The pool is named after the network
+        # location of the framework's master URI.
+        parsed_uri = urlparse(self._master_uri)
+        self._worker_pool = WorkerPool(
+            name=f"WorkerPool_{parsed_uri.netloc}",
+            _logger=self._logger
+            )
+        self._worker_pools = WorkerPools(worker_pools=[self._worker_pool])
+        self._workload = Workload.from_task_graphs({})
+
+        # Return the response.
+        return erdos_scheduler_pb2.RegisterFrameworkResponse(
+            success=True,
+            message=f"{framework_name} at {self._master_uri} registered successfully!",
+        )
+
+    async def RegisterDriver(self, request, context):
+        """Registers a driver and assigns it to the first Worker that can fit it."""
+        # Guard before subtracting: DeregisterFramework resets
+        # `_initialization_time` to None, so it is only usable once registered.
+        request_time = EventTime(request.timestamp, EventTime.Unit.US)
+        if not self._initialized:
+            self._logger.warning(
+                "[%s] Trying to register a driver with name %s and id %s, "
+                "but no framework is registered yet.",
+                request_time,
+                request.name,
+                request.id,
+            )
+            return erdos_scheduler_pb2.RegisterDriverResponse(
+                success=False,
+                message="Framework not registered yet.",
+                worker_id="",
+            )
+        sim_time = request_time - self._initialization_time
+
+        # Create a Task for the Driver, and add it to the list of drivers.
+        # TODO (Sukrit): We drop the memory requirements for now, we should use
+        # them to do multi-dimensional packing using STRL.
+        self._logger.info(
+            "[%s] Received a request to register a driver with name %s, URI: %s. "
+            "The driver requires %s cores and %s memory.",
+            sim_time,
+            request.id,
+            request.uri,
+            request.cores,
+            request.memory,
+        )
+        driver_resources = Resources(
+            resource_vector={Resource(name="Slot_CPU", _id="any"): 1},
+            _logger=self._logger,
+        )
+        driver_job = Job(
+            name=request.id,
+            profile=WorkProfile(
+                name=f"WorkProfile_{request.id}",
+                execution_strategies=ExecutionStrategies(
+                    [
+                        ExecutionStrategy(
+                            resources=driver_resources,
+                            batch_size=1,
+                            # NOTE (Sukrit): Drivers are long running, and have no
+                            # fixed runtime. Setting it to zero helps us unload the
+                            # driver from the Worker whenever we need it.
+                            runtime=EventTime.zero(),
+                        )
+                    ]
+                ),
+            ),
+        )
+        driver = Task(
+            name=request.id,
+            task_graph=request.uri,
+            job=driver_job,
+            deadline=EventTime.invalid(),
+            _logger=self._logger,
+        )
+        self._drivers[request.id] = driver
+
+        # Try every (Worker, strategy) pair and return as soon as one fits the driver.
+        for worker in self._worker_pool.workers:
+            for execution_strategy in driver.available_execution_strategies:
+                if worker.can_accomodate_strategy(execution_strategy):
+                    # This Worker can accomodate the Driver, we assign it here.
+                    # self._worker_pool.place_task(driver, execution_strategy, worker.id)
+
+                    # Update the Task's state and placement information.
+                    placement_time = sim_time
+                    driver.schedule(
+                        time=placement_time,
+                        placement=Placement(
+                            type=Placement.PlacementType.PLACE_TASK,
+                            computation=driver,
+                            placement_time=placement_time,
+                            worker_pool_id=self._worker_pool.id,
+                            worker_id=worker.id,
+                            strategy=execution_strategy,
+                        ),
+                    )
+                    driver.start(placement_time)
+
+                    # Tell the framework to start the driver.
+                    return erdos_scheduler_pb2.RegisterDriverResponse(
+                        success=True,
+                        message=f"[{sim_time}] Driver {request.id} registered successfully!",
+                        worker_id=worker.name,
+                    )
+
+        return erdos_scheduler_pb2.RegisterDriverResponse(
+            success=False,
+            message=f"[{sim_time}] No Worker can accomodate the driver {request.id} yet.",
+            worker_id="",
+        )
+
+    async def DeregisterDriver(self, request, context):
+        """Marks the driver with the given id as finished and forgets it."""
+        # Guard first: `_initialization_time` is None once the framework deregisters.
+        request_time = EventTime(request.timestamp, EventTime.Unit.US)
+        if not self._initialized:
+            self._logger.warning(
+                "[%s] Trying to deregister a driver with id %s, "
+                "but no framework is registered yet.",
+                request_time,
+                request.id,
+            )
+            return erdos_scheduler_pb2.DeregisterDriverResponse(
+                success=False, message="Framework not registered yet."
+            )
+        completion_time = request_time - self._initialization_time
+        if request.id not in self._drivers:
+            self._logger.warning(
+                "[%s] Trying to deregister a driver with id %s, "
+                "but no driver with that id is registered.",
+                completion_time,
+                request.id,
+            )
+            return erdos_scheduler_pb2.DeregisterDriverResponse(
+                success=False,
+                message=f"[{completion_time}] Driver with id {request.id} not registered yet.",
+            )
+
+        driver = self._drivers[request.id]
+        # self._worker_pool.remove_task(completion_time, driver)
+        driver.finish(completion_time)
+        del self._drivers[request.id]
+        return erdos_scheduler_pb2.DeregisterDriverResponse(
+            success=True,
+            message=f"[{completion_time}] Driver with id {request.id} deregistered successfully!",
+        )
+
+    async def RegisterTaskGraph(self, request, context):
+        """Registers a new TaskGraph with the backend scheduler.
+
+        This is the entry point for a new Spark application (an EHLO): it builds
+        one Task per entry of `request.dependencies`, gives every Task a deadline
+        derived from the graph's critical path, and adds the graph to the Workload.
+        """
+        # Guard before subtracting: DeregisterFramework resets
+        # `_initialization_time` to None, so it is only usable once registered.
+        request_time = EventTime(request.timestamp, EventTime.Unit.US)
+        if not self._initialized:
+            self._logger.warning(
+                "[%s] Trying to register a task graph with ID %s and name %s, "
+                "but no framework is registered yet.",
+                request_time,
+                request.id,
+                request.name,
+            )
+            return erdos_scheduler_pb2.RegisterTaskGraphResponse(
+                success=False, message="Framework not registered yet.", num_executors=0
+            )
+        sim_time = request_time - self._initialization_time
+
+        if request.id in self._workload.task_graphs:
+            self._logger.warning(
+                "[%s] The application with ID %s and name %s was already registered.",
+                sim_time,
+                request.id,
+                request.name,
+            )
+            return erdos_scheduler_pb2.RegisterTaskGraphResponse(
+                success=False,
+                message=f"[{sim_time}] Application ID {request.id} with name "
+                f"{request.name} already registered!",
+                num_executors=0,
+            )
+
+        self._logger.info(
+            "[%s] Attempting to register application ID %s with name %s",
+            sim_time,
+            request.id,
+            request.name,
+        )
+        # Check if query is from TPC-H workload.
+        # If yes, retrieve profiled slots and runtime info. If no, use default values
+        is_tpch_query = False
+        tpch_query_all_stage_info = None
+        if request.name.startswith("TPCH Query"):
+            is_tpch_query = True
+            # Retrieve tasks-per-stage and runtime info based on query specifications
+            # Split the string by spaces
+            query_parts = request.name.split()
+
+            # Initialize dataset_size and max_executor variables with default
+            tpch_query_num = None
+            tpch_dataset_size = int(FLAGS.tpch_dataset_size)
+            tpch_max_executors_per_job = int(FLAGS.tpch_max_executors_per_job)
+
+            # Check if the string has the required format
+            # Format 1: "TPCH Query <queryNum>"
+            # Format 2: "TPCH Query <queryNum> <datasetSize> <maxExecutors>"
+            if len(query_parts) >= 3 and query_parts[:2] == ["TPCH", "Query"]:
+                tpch_query_num = int(query_parts[2])
+
+                # If dataset size and max cores are provided
+                if len(query_parts) >= 5:
+                    tpch_dataset_size = int(query_parts[3])
+                    tpch_max_executors_per_job = int(query_parts[4])
+            # NOTE(review): tpch_query_num is still None here if neither format matched.
+            tpch_query_all_stage_info = get_all_stage_info_for_query(
+                query_num=tpch_query_num,
+                profile_type=FLAGS.tpch_profile_type,
+                dataset_size=tpch_dataset_size,
+                max_executors=tpch_max_executors_per_job,
+            )
+
+            same_structure, stage_id_mapping = verify_and_relable_tpch_app_graph(
+                query_num=tpch_query_num, dependencies=request.dependencies
+            )
+
+            # Return a failure response if the app's DAG does not match the TPCH query.
+            if not same_structure:
+                self._logger.warning(
+                    "[%s] TPCH application with ID %s and name %s couldn't be registered. "
+                    "DAG structure mismatch!",
+                    sim_time,
+                    request.id,
+                    request.name,
+                )
+                return erdos_scheduler_pb2.RegisterTaskGraphResponse(
+                    success=False,
+                    message=f"[{sim_time}] TPCH application ID {request.id} with name {request.name}"
+                    f" couldn't be registered. DAG structure mismatch!",
+                    num_executors=0,
+                )
+
+        # Construct all the Tasks for the TaskGraph.
+        task_ids_to_task: Mapping[int, Task] = {}
+        # Non-TPCH stages all share these fixed default resources and runtime.
+        default_resource = Resources(
+            resource_vector={Resource(name="Slot_CPU", _id="any"): 20},
+            _logger=self._logger,
+        )
+        default_runtime = EventTime(
+            math.ceil(20 * FLAGS.spark_task_duration_multiplier), EventTime.Unit.US
+        )
+
+        for i, task_dependency in enumerate(request.dependencies):
+            framework_task = task_dependency.key
+            if is_tpch_query:
+                mapped_stage_id = stage_id_mapping[framework_task.id]
+                stage_info = tpch_query_all_stage_info[mapped_stage_id]
+
+                # NOTE: task_slots and task_runtime given to scheduler might be updated
+                # based on tpch_max_executors_per_job. If task_slots >
+                # tpch_max_executors_per_job, we transform (task_slots * task_runtime)
+                # as tpch_max_executors_per_job * (
+                #           (task_slots * task_runtime)/tpch_max_executors_per_job
+                # )
+                # TODO: (DG) It is not foolproof since scheduler can give more than
+                # tpch_max_executors_per_job to app if it decides to run multiple
+                # independent stages in parallel
+
+                profiled_task_slots = stage_info["num_tasks"]
+
+                # Profiled runtime (in ms) * duration_multiplier is converted
+                # to nearest second
+                profiled_task_runtime = math.ceil(
+                    (stage_info["avg_task_duration_ms"] / 1000)
+                    * FLAGS.spark_task_duration_multiplier
+                )
+
+                if FLAGS.uniformly_sample_task_slots:
+                    # Chosen to override profiled tasks slots for TPCH
+                    # TODO: (DG) The (30, 70) range is outside default max_executors
+                    # set to 50. Need to update code to correctly use max_executors later
+                    # TODO: (DG) Don't like that seed is now going to change the dag structure
+                    # everytime a new app arrives in the workload.
+                    # Induces variability but seems weird.
+                    # NOTE: tpch_max_executors is 50 but we will sample upto 70.
+                    task_slots = self._rng.randint(30, 70)
+                else:
+                    # Cap the profiled slot count at the per-job executor limit.
+                    task_slots = min(profiled_task_slots, tpch_max_executors_per_job)
+
+                # TODO: (DG) Adjust runtime if using uniformly_sample_task_slots
+                # Currently, runtimes still being calculated based on profiled_task_slots
+                # Setting minimum task_runtime to 8s to allow stages to complete
+                task_runtime = max(
+                    8,
+                    profiled_task_runtime
+                    if profiled_task_slots <= tpch_max_executors_per_job
+                    else math.ceil(
+                        profiled_task_slots * profiled_task_runtime / tpch_max_executors_per_job
+                    ),
+                )
+                if profiled_task_slots > tpch_max_executors_per_job:
+                    self._logger.info(
+                        "[%s] Profiled slots > tpch_max_executors_per_job: %s. Converted "
+                        "(slots,runtime) from (%s,%s) to (%s, %s)",
+                        sim_time,
+                        tpch_max_executors_per_job,
+                        profiled_task_slots,
+                        profiled_task_runtime,
+                        task_slots,
+                        task_runtime,
+                    )
+
+                self._logger.info(
+                    "[%s] Creating Task for given app TPCH stage: %s, mapped to "
+                    "original stage id %s, with tasks: %s and avg runtime (s): %s. "
+                    "Used multiplier: %s",
+                    sim_time,
+                    framework_task.id,
+                    mapped_stage_id,
+                    task_slots,
+                    task_runtime,
+                    FLAGS.spark_task_duration_multiplier,
+                )
+            task_ids_to_task[framework_task.id] = Task(
+                name=f"task_{framework_task.name}_{i}",
+                task_graph=request.id,
+                job=Job(
+                    name=f"job_{framework_task.name}_{i}",
+                    profile=WorkProfile(
+                        name=f"WorkProfile_{framework_task.name}",
+                        execution_strategies=ExecutionStrategies(
+                            [
+                                ExecutionStrategy(
+                                    resources=(
+                                        default_resource
+                                        if not is_tpch_query
+                                        else Resources(
+                                            resource_vector={
+                                                Resource(
+                                                    name="Slot_CPU", _id="any"
+                                                ): task_slots
+                                            },
+                                            _logger=self._logger,
+                                        )
+                                    ),
+                                    batch_size=1,
+                                    runtime=(
+                                        default_runtime
+                                        if not is_tpch_query
+                                        else EventTime(task_runtime, EventTime.Unit.US)
+                                    ),
+                                )
+                            ]
+                        ),
+                    ),
+                ),
+                # NOTE: (DG) Removed setting deadline here and will set deadline
+                # based on taskgraphs critical path instead.
+                deadline=EventTime.invalid(),
+
+                # TODO (Sukrit): We should maintain a counter for each application
+                # type so that we can correlate the Tasks with a particular invocation.
+                timestamp=1,
+                _logger=self._logger,
+            )
+            # NOTE (Sukrit): We maintain the StageID of the Task as a separate field
+            # that is not accessible / used by the Simulator.
+            task_ids_to_task[framework_task.id].stage_id = framework_task.id
+            self._logger.info(
+                "[%s] Constructed Task %s for the TaskGraph %s.",
+                sim_time,
+                framework_task.name,
+                request.id,
+            )
+
+        # Construct the TaskGraph from the Tasks.
+        task_graph_structure: Mapping[Task, Sequence[Task]] = {}
+        for task_dependency in request.dependencies:
+            task_graph_structure[task_ids_to_task[task_dependency.key.id]] = [
+                task_ids_to_task[task_id] for task_id in task_dependency.children_ids
+            ]
+        task_graph = TaskGraph(
+            name=request.id,
+            tasks=task_graph_structure,
+        )
+
+        # Calculating critical path time from task graph
+        critical_path = task_graph.get_longest_path(
+            weights=lambda task: (task.slowest_execution_strategy.runtime.time)
+        )
+        critical_path_time = (
+            sum(
+                [t.slowest_execution_strategy.runtime for t in critical_path],
+                start=EventTime.zero(),
+            )
+            .to(EventTime.Unit.US)
+            .time
+        )
+
+        # Setting taskgraph and task deadlines using critical_path_time *
+        # deadline_variance_factor, i.e. 1 + (a percentage drawn from self._rng) / 100.
+        deadline_variance_factor = 1.0 + (
+            self._rng.randint(
+                self._min_task_graph_deadline_variance,
+                self._max_task_graph_deadline_variance,
+            )
+        ) / 100
+        task_graph_slo_time = math.ceil(
+            critical_path_time * deadline_variance_factor
+        )
+
+        for task in task_graph.get_nodes():
+            deadline = EventTime(
+                sim_time.time + task_graph_slo_time, unit=EventTime.Unit.US
+            )
+            task.update_deadline(deadline)
+
+        task_graph.to_dot(f"{request.id}.dot")
+        self._workload.add_task_graph(task_graph)
+        self._logger.info(
+            "[%s] Added the TaskGraph(name=%s, id=%s, deadline=%s, "
+            "critical_path_time = %s, task_graph_slo_time = %s, "
+            "deadline_variance_factor= %s) to the Workload.",
+            sim_time,
+            request.name,
+            request.id,
+            task_graph.deadline,
+            critical_path_time,
+            task_graph_slo_time,
+            deadline_variance_factor,
+        )
+        self._logger.info(
+            "[%s] The structure of the TaskGraph %s is \n%s.",
+            sim_time,
+            request.id,
+            str(task_graph),
+        )
+
+        # Increment total number of taskgraphs registered.
+        self._total_taskgraphs_registered += 1
+
+        # Show current run statistics.
+        self._logger.info(
+            "[%s] RUN_STATS (registered, met, missed, cancelled): %s, %s, %s, %s",
+            sim_time,
+            self._total_taskgraphs_registered,
+            self._total_taskgraphs_met,
+            self._total_taskgraphs_missed,
+            self._total_taskgraphs_cancelled,
+        )
+
+        # Return the response.
+        # TODO: (DG) Might want to change the number of initial executors if it causes
+        # issues in scaled up expts
+        return erdos_scheduler_pb2.RegisterTaskGraphResponse(
+            success=True,
+            message=f"[{sim_time}] Application ID {request.id} with name "
+            f"{request.name} and deadline {task_graph.deadline} registered successfully!",
+            num_executors=FLAGS.initial_executors,
+        )
+
+    async def RegisterEnvironmentReady(self, request, context):
+        """Registers that the environment (i.e., executors) are ready for the given
+        TaskGraph at the specified time.
+
+        This is intended to release the sources of the TaskGraph to the scheduling
+        backend, to consider the application in this scheduling cycle.
+        """
+        # Guard first: `_initialization_time` is None once the framework deregisters.
+        request_time = EventTime(request.timestamp, EventTime.Unit.US)
+        if not self._initialized:
+            self._logger.warning(
+                "[%s] Trying to register that the environment is ready for the TaskGraph "
+                "with ID %s, but no framework is registered yet.",
+                request_time,
+                request.id,
+            )
+            return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse(
+                success=False, message=f"[{request_time}] Framework not registered yet."
+            )
+        sim_time = request_time - self._initialization_time
+
+        task_graph = self._workload.get_task_graph(request.id)
+        if task_graph is None:
+            self._logger.warning(
+                "[%s] Trying to register that the environment is ready for the TaskGraph "
+                "with ID %s, but no TaskGraph with that ID is registered.",
+                sim_time,
+                request.id,
+            )
+            return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse(
+                success=False,
+                message=f"[{sim_time}] TaskGraph with ID {request.id} not registered yet.",
+            )
+
+        if request.num_executors != FLAGS.initial_executors:
+            self._logger.warning(
+                "[%s] The TaskGraph %s requires %s executors, but the environment is ready "
+                "with %s executors.",
+                sim_time,
+                request.id,
+                FLAGS.initial_executors,
+                request.num_executors,
+            )
+            return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse(
+                success=False,
+                message=f"Number of executors not {FLAGS.initial_executors}.",
+            )
+
+        # Release all the sources of the TaskGraph at the given time.
+        for source_task in task_graph.get_source_tasks():
+            source_task.release(sim_time)
+        self._logger.info("[%s] Environment ready for TaskGraph with ID %s!", sim_time, request.id)
+
+        # Run the scheduler since the Workload has changed.
+        await self.run_scheduler()
+
+        return erdos_scheduler_pb2.RegisterEnvironmentReadyResponse(
+            success=True,
+            message=f"[{sim_time}] Environment ready for TaskGraph with ID {request.id}!",
+        )
+
+    async def DeregisterFramework(self, request, context):
+        """Deregisters the framework with the backend scheduler.
+        This is the exit point for a running instance of Spark / Flink to deregister"""
+        # Guard first: `_initialization_time` is reset to None below on deregistration.
+        request_time = EventTime(request.timestamp, EventTime.Unit.US)
+        if not self._initialized:
+            self._logger.warning(
+                "[%s] Trying to deregister the framework at %s, "
+                "but no framework is registered yet.",
+                request_time,
+                request.uri,
+            )
+            return erdos_scheduler_pb2.DeregisterFrameworkResponse(
+                success=False, message=f"[{request_time}] Framework not registered yet."
+            )
+        sim_time = request_time - self._initialization_time
+        if self._master_uri != request.uri:
+            self._logger.warning(
+                "[%s] Trying to deregister the framework at %s, "
+                "but the registered framework is at %s.",
+                sim_time,
+                request.uri,
+                self._master_uri,
+            )
+            return erdos_scheduler_pb2.DeregisterFrameworkResponse(
+                success=False,
+                message=f"[{sim_time}] Framework not registered at {request.uri} yet.",
+            )
+
+        # Deregister the framework.
+        self._initialization_time = None
+        self._master_uri = None
+        self._initialized = False
+        self._logger.info("[%s] Deregistering framework at %s", sim_time, request.uri)
+        return erdos_scheduler_pb2.DeregisterFrameworkResponse(
+            success=True,
+            message=f"[{sim_time}] Framework at {request.uri} deregistered successfully!",
+        )
+
+    async def RegisterWorker(self, request, context):
+        """Registers a new worker with the backend scheduler."""
+        # Guard first: `_initialization_time` is None once the framework deregisters.
+        wall_time = EventTime(int(time.time()), EventTime.Unit.US)
+        if not self._initialized:
+            self._logger.warning(
+                "[%s] Trying to register a worker with name %s and id %s, "
+                "but no framework is registered yet.",
+                wall_time,
+                request.name,
+                request.id,
+            )
+            return erdos_scheduler_pb2.RegisterWorkerResponse(
+                success=False, message=f"[{wall_time}] Framework not registered yet."
+            )
+        current_time = wall_time - self._initialization_time
+
+        # First, we construct the Resources with the given size.
+        # TODO (Sukrit): Right now, we drop the memory requirements, we should use
+        # them to do multi-dimensional packing using STRL.
+        cpu_resource = Resource(name="Slot_CPU")
+        # TODO: (DG) Override the request.cores to avoid scaling up physical setup
+        worker_resources = Resources(
+            resource_vector={
+                cpu_resource: 640 if FLAGS.override_worker_cpu_count else request.cores
+            },
+            _logger=self._logger,
+        )
+        self._logger.debug(
+            "[%s] Successfully constructed the resources for the worker %s: %s.",
+            current_time,
+            request.name,
+            worker_resources,
+        )
+
+        # Construct a new Worker instance, and add it to the WorkerPool.
+        worker = Worker(
+            name=request.id,
+            resources=worker_resources,
+            _logger=self._logger,
+        )
+        self._worker_pool.add_workers([worker])
+
+        self._logger.info(
+            "[%s] Registering worker with name %s, and resources %s.",
+            current_time,
+            worker.name,
+            worker_resources,
+        )
+
+        # Run the scheduler since the Resource set has changed, and new task graphs
+        # may become eligible to run.
+        await self.run_scheduler()
+
+        return erdos_scheduler_pb2.RegisterWorkerResponse(
+            success=True,
+            message=f"[{current_time}] Worker {request.name} registered successfully!",
+            cores=FLAGS.virtualized_cores,
+            memory=FLAGS.virtualized_memory * 1024,
+        )
+
+    async def NotifyTaskCompletion(self, request, context):
+        """Notifies the backend scheduler that a task has completed."""
+        # Guard first: `_initialization_time` is None once the framework deregisters.
+        request_time = EventTime(request.timestamp, EventTime.Unit.US)
+        if not self._initialized:
+            self._logger.warning(
+                "[%s] Trying to notify the backend scheduler that the task with ID %s "
+                "from application %s has completed, "
+                "but no framework is registered yet.",
+                request_time,
+                request.task_id,
+                request.application_id,
+            )
+            return erdos_scheduler_pb2.NotifyTaskCompletionResponse(
+                success=False, message="Framework not registered yet."
+            )
+        sim_time = request_time - self._initialization_time
+
+        task_graph = self._workload.get_task_graph(request.application_id)
+        if task_graph is None:
+            self._logger.warning(
+                "[%s] Trying to notify the backend scheduler that the task with ID %s "
+                "from application %s has completed, but the application "
+                "was not registered with the backend yet.",
+                sim_time,
+                request.task_id,
+                request.application_id,
+            )
+            return erdos_scheduler_pb2.NotifyTaskCompletionResponse(
+                success=False,
+                message=f"[{sim_time}] Application with ID {request.application_id} "
+                f"not registered yet.",
+            )
+
+        # Find the Task that has completed, and mark it as such.
+        matched_task = None
+        for task in task_graph.get_nodes():
+            if task.stage_id == request.task_id:
+                matched_task = task
+        if matched_task is None:
+            self._logger.warning(
+                "[%s] Trying to notify the backend scheduler that the task with ID %s "
+                "from application %s has completed, but the task "
+                "was not found in the TaskGraph.",
+                sim_time,
+                request.task_id,
+                request.application_id,
+            )
+            return erdos_scheduler_pb2.NotifyTaskCompletionResponse(
+                success=False,
+                message=f"[{sim_time}] Task with ID {request.task_id} "
+                f"not found in TaskGraph {request.application_id}.",
+            )
+
+        # Instead of completing & removing the task immediately, check
+        # if it is actually complete or will complete in the future
+        # Get the actual task completion timestamp
+        # actual_task_completion_time = (
+        #     matched_task.start_time.time + matched_task.remaining_time.time
+        # )
+        actual_task_completion_time = sim_time.time + matched_task.remaining_time.time
+
+        self._logger.info(
+            "[%s] Received task for completion. task.start_time: %s ,"
+            "task.remaining_time:  %s ,  actual completion time: %s. "
+            "Task details: %s",
+            sim_time.time,
+            matched_task.start_time.time,
+            matched_task.remaining_time.time,
+            actual_task_completion_time,
+            matched_task,
+        )
+
+        # NOTE(review): with the sim_time-based estimate above, this branch can only
+        # fire when remaining_time is negative -- confirm that is the intent.
+        if sim_time.time > actual_task_completion_time:
+            self._logger.warning(
+                "[%s] Task exceeded actual completion time by %s, "
+                "Task details: %s",
+                sim_time.time,
+                (sim_time.time - actual_task_completion_time),
+                matched_task,
+            )
+
+        # TODO DG: remaining_time assumes execution of the slowest strategy
+        # Should be updated to reflect correct remaining_time based on chosen strategy?
+
+        # Add all tasks to _tasks_marked_for_completion queue.
+        # If task has actually completed, it will be dequeued immediately
+        # Else it will be dequeued at its actual task completion time
+        self._tasks_marked_for_completion.put(
+            TimedItem(actual_task_completion_time, matched_task)
+        )
+
+        # NOTE: task.finish() and run_scheduler() invocations are postponed
+        # until it is time for the task to be actually marked as complete.
+
+        return erdos_scheduler_pb2.NotifyTaskCompletionResponse(
+            success=True,
+            message=f"Task with ID {request.task_id} marked for completion at "
+            f"{sim_time}! It will be removed on actual "
+            f"task completion time at {actual_task_completion_time}",
+        )
+
+    async def GetPlacements(self, request, context):
+        """Retrieves the placements applicable at the specified time.
+
+        Every pending placement of application `request.id` whose placement
+        time is at or before the request's timestamp is started on its worker
+        pool and reported back; placements of cancelled tasks are dropped.
+        The scheduler is re-run before a successful response is returned.
+
+        Args:
+            request: The RPC request; `timestamp` and `id` are read from it.
+            context: The RPC context (not used by this method).
+
+        Returns:
+            A `GetPlacementsResponse`. `success` is False when no framework is
+            registered or the application has no entry in `self._placements`;
+            otherwise it carries one `Placement` per task started by this call.
+        """
+        sim_time = (
+            EventTime(request.timestamp, EventTime.Unit.US)
+            - self._initialization_time
+        )
+
+        if not self._initialized:
+            self._logger.warning(
+                "[%s] Trying to get placements for %s, "
+                "but no framework is registered yet.",
+                sim_time,
+                request.id,
+            )
+            return erdos_scheduler_pb2.GetPlacementsResponse(
+                success=False, message="Framework not registered yet."
+            )
+
+        if request.id not in self._placements:
+            self._logger.warning(
+                "[%s] Trying to get placements for %s, but the application "
+                "was not registered with the backend yet OR was cancelled.",
+                sim_time,
+                request.id,
+            )
+            return erdos_scheduler_pb2.GetPlacementsResponse(
+                success=False,
+                message=f"[{sim_time}] Trying to get placements for "
+                f"{request.id}, but the application was not registered with the "
+                f"backend yet OR was cancelled.",
+            )
+
+        # Construct and return the placements.
+        placements = []
+
+        # Keep track of app_ids and task_names to delete after placements are issued
+        to_delete = []
+
+        for task, task_placement in self._placements[request.id].items():
+            if task.state is TaskState.CANCELLED:
+                # Task cancelled, add to list to remove from self._placements
+                to_delete.append((request.id, task))
+            elif task_placement.placement_time <= sim_time:
+                # TODO: (DG) Due to small dataset size, each stage automatically gets
+                # one data partition i.e. one task and one executor. But later for
+                # large datasets, we might leverage use_profile_to_scale_executors
+                # to modify the placement before it is sent
+                self._logger.info(
+                    f"[{sim_time}] Going to set placement.task to run: {task_placement}"
+                )
+
+                # Mark the Task as RUNNING.
+                # Right now we don't run task.start() if
+                # task is already in RUNNING or CANCELLED state.
+                # Only SCHEDULED -> RUNNING transition is allowed.
+                if task.state == TaskState.SCHEDULED:
+                    try:
+                        # Initialize the task at the given placement time,
+                        # and place it on the WorkerPool.
+                        worker_pool = self._worker_pools.get_worker_pool(
+                            task_placement.worker_pool_id
+                        )
+                        assert (
+                            worker_pool is not None
+                        ), f"No WorkerPool found with ID: {task_placement.worker_pool_id}."
+
+                        # Display worker pool utilization before placing task.
+                        # The loop variable is deliberately NOT named
+                        # `worker_pool`: reusing that name would rebind the
+                        # placement's target pool to the last pool iterated,
+                        # and the task would then be placed on the wrong pool.
+                        for pool in self._worker_pools.worker_pools:
+                            pool_resources = pool.resources
+                            for resource_name in set(
+                                map(lambda value: value[0].name, pool_resources.resources)
+                            ):
+                                resource = Resource(name=resource_name, _id="any")
+                                self._logger.info(
+                                    f"{sim_time},WORKER_POOL_UTILIZATION,{pool.id},"
+                                    f"{resource_name},"
+                                    f"{pool_resources.get_allocated_quantity(resource)},"
+                                    f"{pool_resources.get_available_quantity(resource)}"
+                                )
+
+                        # Perform worker pool step
+                        self._logger.info(
+                            "[%s] Need to perform a step before place_task() for %s.",
+                            sim_time,
+                            task,
+                        )
+                        completed_tasks = self.PerformWorkerPoolStep(sim_time=sim_time)
+
+                        # Finish all tasks that have now completed
+                        for completed_task in completed_tasks:
+                            self.CleanupTaskExecution(
+                                task=completed_task,
+                                sim_time=sim_time,
+                            )
+
+                        # Place the task on the worker pool. When the scheduler
+                        # is flagged as task-type, no explicit placement call is
+                        # made and the placement is treated as successful.
+                        if self._scheduler_is_task_type:
+                            success = True
+                        else:
+                            success = worker_pool.place_task(
+                                task,
+                                execution_strategy=task_placement.execution_strategy,
+                                worker_id=task_placement.worker_id,
+                            )
+                        if success:
+                            task.start(sim_time)
+                            self._logger.info(
+                                "[%s] Successfully started task: %s on worker_pool: %s",
+                                sim_time,
+                                task,
+                                worker_pool,
+                            )
+                            placements.append(
+                                erdos_scheduler_pb2.Placement(
+                                    worker_id=task_placement.worker_id,
+                                    application_id=request.id,
+                                    task_id=task_placement.task.stage_id,
+                                    cores=1,
+                                )
+                            )
+
+                            # Add to delete list for clearing placement after it has been released
+                            to_delete.append((request.id, task))
+                            self._logger.debug(
+                                "[%s] Added tuple (%s, %s) to to_delete list.",
+                                sim_time,
+                                request.id,
+                                task,
+                            )
+
+                            # Record the placement as executed now that it has been issued
+                            self._executed_placements[task] = task_placement
+                        else:
+                            self._logger.warning(
+                                "[%s] Could not start task: %s on worker_id: %s and execution strategy: %s",
+                                sim_time,
+                                task,
+                                task_placement.worker_id,
+                                task_placement.execution_strategy,
+                            )
+                    except ValueError as e:
+                        self._logger.error(f"[{sim_time}] start() errored for task: {task}")
+                        self._logger.error(f"[{sim_time}] Error: {e}")
+
+        # Remove issued placements from self._placements. This is done after
+        # the loop above so the mapping is not mutated while being iterated.
+        for app_id, task_name in to_delete:
+            del self._placements[app_id][task_name]
+            self._logger.info(
+                "[%s] Removed placement (app_id=%s, task_name=%s) from self._placements",
+                sim_time,
+                app_id,
+                task_name,
+            )
+
+        self._logger.info(
+            "[%s] Constructed %s placements for application with ID %s.",
+            sim_time,
+            len(placements),
+            request.id,
+        )
+
+        # Run the scheduler since the Workload has changed.
+        await self.run_scheduler()
+
+        return erdos_scheduler_pb2.GetPlacementsResponse(
+            success=True,
+            placements=placements,
+            message=f"[{sim_time}] Constructed {len(placements)} "
+            f"placements.",
+        )
+
+    # Function to pop tasks from queue based on actual completion time
+    async def PopTasksBasedOnTime(self):
+        """Polls the pending-completion queue and finalizes tasks that are due.
+
+        Runs forever. On each pass the entry at the head of
+        `self._tasks_marked_for_completion` is inspected; once its timestamp
+        is at or before the current time (measured relative to
+        `self._initialization_time`), the entry is dequeued, the worker pools
+        are stepped, the task is cleaned up unless it is already COMPLETED, and
+        the scheduler is re-run. When the queue is empty or its head is not
+        yet due, the coroutine sleeps for 0.1s before checking again.
+        """
+        while True:
+            if self._tasks_marked_for_completion.empty():
+                # If the queue is empty, sleep for a short duration
+                await asyncio.sleep(0.1)  # TODO: Can adjust value, curr=0.1s
+                continue
+
+            # Peek at the head of the priority queue without removing it.
+            head_item = self._tasks_marked_for_completion._queue[0][1]
+
+            # Current time, expressed relative to the service initialization time.
+            now = (
+                EventTime(int(time.time()), EventTime.Unit.US)
+                - self._initialization_time
+            )
+            if not (head_item.timestamp <= now.time):
+                # The head's timestamp hasn't been reached yet,
+                # sleep for a short duration
+                await asyncio.sleep(0.1)  # TODO: Can adjust value, curr=0.1s
+                continue
+
+            # The head is due: dequeue it.
+            due_item = self._tasks_marked_for_completion.get()
+            self._logger.info(
+                "[%s] Removing task from pending completion queue. "
+                "Task details: %s. "
+                "Timestamp: %s",
+                now,
+                due_item.task,
+                head_item.timestamp,
+            )
+
+            # Display worker pool utilization before removing task,
+            # one log line per (worker pool, resource name) pair.
+            for worker_pool in self._worker_pools.worker_pools:
+                pool_resources = worker_pool.resources
+                resource_names = {entry[0].name for entry in pool_resources.resources}
+                for resource_name in resource_names:
+                    resource = Resource(name=resource_name, _id="any")
+                    self._logger.info(
+                        f"{now},WORKER_POOL_UTILIZATION,{worker_pool.id},"
+                        f"{resource_name},"
+                        f"{pool_resources.get_allocated_quantity(resource)},"
+                        f"{pool_resources.get_available_quantity(resource)}"
+                    )
+
+            # Perform worker pool step
+            self._logger.info(
+                "[%s] Need to perform a step before remove_task() for %s.",
+                now,
+                due_item.task,
+            )
+            completed_tasks = self.PerformWorkerPoolStep(sim_time=now)
+            # TODO: (DG) For simplicity, only the single dequeued item is
+            # cleaned up per pass. Later, all tasks reported in
+            # `completed_tasks` could be cleaned up here.
+
+            if due_item.task.state == TaskState.COMPLETED:
+                # The task state was already cleaned up after another
+                # invocation of PerformWorkerPoolStep, so nothing is left to do.
+                self._logger.info(
+                    "[%s] Task %s already in COMPLETED state while processing "
+                    "in PopTasksBasedOnTime.",
+                    now,
+                    due_item.task,
+                )
+            else:
+                self._logger.info(
+                    "[%s] PopTasksBasedOnTime invoking CleanupTaskExecution "
+                    "for task %s",
+                    now,
+                    due_item.task,
+                )
+                # Resource release, task finish and task-graph accounting
+                # are all delegated to CleanupTaskExecution.
+                self.CleanupTaskExecution(task=due_item.task, sim_time=now)
+
+            # Run the scheduler since the Workload has changed.
+            await self.run_scheduler()
+
+    def PerformWorkerPoolStep(self, sim_time):
+        """Steps every worker pool up to `sim_time` and collects finished tasks.
+
+        Each pool is stepped from `self._last_step_up_time` by the time elapsed
+        since that point, after which `self._last_step_up_time` is advanced to
+        `sim_time`.
+
+        Args:
+            sim_time: The time up to which the worker pools are stepped.
+
+        Returns:
+            A list with the tasks reported as completed by all worker pools
+            (empty when there are no worker pools or nothing completed).
+        """
+        # Get time elapsed since last step up time
+        time_elapsed_since_last_step = sim_time - self._last_step_up_time
+
+        # step up all tasks on the worker-pool to reflect correct remaining time
+        self._logger.info(
+            "[%s] Stepping for %s timesteps.",
+            sim_time,
+            time_elapsed_since_last_step,
+        )
+
+        # Accumulate across pools: keeping only the result of the last pool
+        # would silently drop completions reported by the earlier ones.
+        completed_tasks = []
+        for worker_pool in self._worker_pools.worker_pools:
+            newly_completed = list(
+                worker_pool.step(
+                    self._last_step_up_time, time_elapsed_since_last_step
+                )
+            )
+            for task in newly_completed:
+                self._logger.info(
+                    "[%s] Task %s was now found complete.",
+                    sim_time,
+                    task,
+                )
+            completed_tasks.extend(newly_completed)
+
+        # Update _last_step_up_time
+        self._last_step_up_time = sim_time
+
+        return completed_tasks
+    
+    def CleanupTaskExecution(self, task, sim_time):
+        """Finalizes a task at `sim_time` and updates task-graph statistics.
+
+        The task is removed from its worker pool, marked finished, and reported
+        to the workload; every task released by that notification is released
+        at `sim_time`. If the task's graph is now complete, the met/missed
+        deadline counters are updated and the run statistics are logged.
+
+        Args:
+            task: The task whose execution is being cleaned up.
+            sim_time: The time at which the cleanup takes place.
+
+        Raises:
+            RuntimeError: If the workload has no task graph for the task.
+        """
+        self._logger.info(
+            "[%s] Cleaning up task execution for task %s.",
+            sim_time,
+            task,
+        )
+
+        # Free the resources on the worker pool for the completed task
+        hosting_pool = self._worker_pools.get_worker_pool(task.worker_pool_id)
+        hosting_pool.remove_task(current_time=sim_time, task=task)
+
+        # Mark the Task as completed.
+        task.update_remaining_time(EventTime.zero())
+        task.finish(sim_time)
+
+        # Notify the workload and release whatever tasks it hands back.
+        newly_released, _cancelled = self._workload.notify_task_completion(
+            task=task,
+            finish_time=sim_time,
+        )
+        for successor in newly_released:
+            successor.release(sim_time)
+
+        # TODO: Might do for cancelled too
+
+        task_graph = self._workload.get_task_graph(task.task_graph)
+        if task_graph is None:
+            self._logger.error(f"[{sim_time}] Taskgraph for task {task} is None")
+            raise RuntimeError(f"[{sim_time}] Taskgraph for task {task} is None")
+
+        if not task_graph.is_complete():
+            # Nothing to account for until the whole graph has finished.
+            return
+
+        self._logger.info(f"[{sim_time}] Finished task_graph {task_graph.name}")
+        deadline_missed = task_graph.deadline < sim_time
+        if deadline_missed:
+            self._logger.info(f"[{sim_time}] Missed deadline for task_graph {task_graph.name}")
+            self._total_taskgraphs_missed += 1
+        else:
+            self._logger.info(f"[{sim_time}] Met deadline for task_graph {task_graph.name}")
+            self._total_taskgraphs_met += 1
+
+        self._logger.info(
+            "[%s] RUN_STATS (registered, met, missed, cancelled): %s, %s, %s, %s",
+            sim_time,
+            self._total_taskgraphs_registered,
+            self._total_taskgraphs_met,
+            self._total_taskgraphs_missed,
+            self._total_taskgraphs_cancelled,
+        )
+
+
+async def serve():
+    """Serves the ERDOS Scheduling RPC Server.
+
+    Builds the gRPC server with a thread pool sized by `FLAGS.max_workers`,
+    registers a `SchedulerServiceServicer` on it, listens on `FLAGS.port`,
+    and then waits until the server terminates.
+    """
+    # Initialize the server.
+    executor = futures.ThreadPoolExecutor(max_workers=FLAGS.max_workers)
+    rpc_server = grpc.aio.server(executor)
+    servicer = SchedulerServiceServicer()
+    erdos_scheduler_pb2_grpc.add_SchedulerServiceServicer_to_server(
+        servicer, rpc_server
+    )
+
+    # Start the server.
+    listen_address = f"[::]:{FLAGS.port}"
+    rpc_server.add_insecure_port(listen_address)
+    await rpc_server.start()
+    print("Initialized ERDOS Scheduling RPC Server on port", FLAGS.port)
+    await rpc_server.wait_for_termination()
+
+
+def main(argv):
+    """Parses the command-line flags and runs the RPC server until it exits.
+
+    Args:
+        argv: The command-line arguments, passed to `flags.FLAGS` for parsing.
+    """
+    # Parse the command-line flags
+    flags.FLAGS(argv)
+
+    # Access the value of the flag
+    multiplier = flags.FLAGS.spark_task_duration_multiplier
+    override_worker_cpus = flags.FLAGS.override_worker_cpu_count
+
+    # Echo the effective configuration before the server starts.
+    print("Multiplier:", multiplier)
+    print("Override worker CPUs:", override_worker_cpus)
+
+    # asyncio.run() creates a fresh event loop, runs serve() to completion,
+    # cancels any tasks still pending, and closes the loop afterwards. This
+    # replaces the manual get_event_loop()/run_until_complete()/close()
+    # sequence, which relies on a deprecated way of obtaining the loop.
+    asyncio.run(serve())
+
+
+# Script entry point: `app.run` is handed `main` as the callback to execute.
+if __name__ == "__main__":
+    app.run(main)
diff --git a/rpc/spark_erdos_setup.md b/rpc/spark_erdos_setup.md
new file mode 100644
index 00000000..fe8f3678
--- /dev/null
+++ b/rpc/spark_erdos_setup.md
@@ -0,0 +1,246 @@
+# Setup Instructions for Spark Mirror and ERDOS
+
+This README provides step-by-step instructions to set up the environment, compile the Spark Mirror, and build the ERDOS scheduling simulator.
+
+## Prerequisites
+- Conda
+- Git
+- [Java Development Kit (JDK) 17.0.9](https://openjdk.org/)
+
+---
+
+## Step 0A: Create Conda Environment
+```bash
+conda create -n <env_name> python=3.10
+```
+
+### Activate the environment:
+```bash
+conda activate <env_name>
+```
+
+### If jdk17.0.9 isn't installed, install it for <env_name>
+```bash
+conda install -c conda-forge openjdk=17.0.9
+```
+
+## Step 0B: Setup TPCH (dataset, jar) workload
+Build the dataset
+```bash
+cd /path/to/tpch-spark/dbgen
+
+make
+
+./dbgen
+```
+
+Running `./dbgen` above creates a dataset of scale factor `s` of `1` (default) i.e. 1GB.
+
+> NOTE: The Scala version in `tpch.sbt` was updated to `2.13.0`. The sbt version used was `1.9.7`.
+
+Next, we build the target for `tpch-spark`:
+```bash
+sbt package
+```
+
+> NOTE: In case of errors in building the target, check `openjdk` version. It should be `17` and not `21`.
+
+
+## Step 1: Setup `spark-mirror`
+Clone the repository with submodules
+```bash
+git clone https://github.com/dhruvsgarg/spark_mirror.git --recursive
+```
+
+> NOTE: If the submodule was cloned earlier but has been updated since, `git fetch --all` will not be able to track those changes. To pull in updates
+> from submodule's parent, run `git submodule update --init --recursive`.
+
+### Verify branch
+Verify or set current branch `erdos-spark-integration`
+
+### Verify env variable `SPARK_HOME`
+Verify or set `SPARK_HOME` to point to the correct location of `spark-mirror`.
+
+### Verify env variable `JAVA_HOME`
+> NOTE: `JAVA_HOME` should automatically get set to `/path/to/anaconda3/envs/<env_name>/lib/jvm`
+
+### For first time compilation (entire package)
+```bash
+./build/sbt package
+```
+
+### For subsequent, quicker iterations
+Start the interactive shell
+```bash
+./build/sbt
+```
+
+Switch to project spark-core
+```bash
+project core
+```
+
+Compile and then package
+```bash
+compile
+package
+```
+
+### Fix guava versions for ERDOS-Spark integration
+Fresh compile+package of spark adds `guava-14.0.1.jar` under `/path/to/spark_mirror/assembly/target/scala-2.13/jars/`.
+This jar interferes with gRPC which requires a `guava-31` jar. To fix:
+- Remove existing `guava-14` jar: `rm assembly/target/scala-2.13/jars/guava-14.0.1.jar`
+- Run `./sbin/patch-erdos.sh`
+- Verify `guava-31.0.1-jre.jar` exists under `assembly/target/scala-2.13/jars/`
+
+### Update `PATH` with spark bin files
+```bash
+export PATH=$PATH:/path/to/spark_mirror/bin
+```
+
+## Step 2: Compile ERDOS
+> NOTE: The `erdos-scheduling-simulator` in Step 2 refers to the separately cloned repository. It is not the `erdos-scheduling-simulator` submodule within
+> the `spark-mirror` repository.
+
+### Clone repo
+```bash
+git clone https://github.com/erdos-project/erdos-scheduling-simulator.git --recursive
+```
+
+### Install requirements for the package
+```bash
+pip install -r requirements.txt
+```
+
+### Set `GUROBI_DIR`
+```bash
+export GUROBI_DIR=/serenity/scratch/dgarg/gurobi/gurobi1003/linux64
+```
+
+### Build inside schedulers/tetrisched/build/
+```bash
+export CMAKE_INSTALL_MODE=ABS_SYMLINK
+
+cmake .. -DINSTALL_GTEST=OFF -DTBB_INSTALL=OFF
+```
+
+* Verify that python bindings are written to the new `<env_name>` conda env and not some old env
+
+### Run make
+```bash
+make -j install
+```
+
+### Test that simulator works with `simple_av_workload`
+> NOTE: You might need to create the `experiments` sub-directory if it doesn't already exist.
+```bash
+python3 main.py --flagfile=configs/simple_av_workload.conf > experiments/simple_av_workload_test.output
+```
+The TaskGraph should complete and meet its deadline.
+
+
+## Step 3: Spark-Erdos service functionality test
+> NOTE: As in Step 2, the `erdos-scheduling-simulator` here also refers to the separately cloned repository.
+
+From the base directory:
+
+### Install the requirements
+```bash
+pip install -r rpc/requirements.txt
+```
+
+### Run protoc to generate the service and message definitions
+```bash
+python -m grpc_tools.protoc -I./rpc/protos --python_out=. --grpc_python_out=. ./rpc/protos/rpc/erdos_scheduler.proto
+```
+
+### Run the service
+```bash
+python -m rpc.service --enforce_deadlines --scheduler_runtime=0 
+```
+
+The above command uses the default argument values from the `service.py` and `main.py`. The default scheduler is `EDF`. Other options available for the
+service are `FIFO` and `TetriSched`. The DSched scheduler is a specific instantiation of the `TetriSched` scheduler. The other schedulers can be run
+as follows:
+
+#### To instantiate FIFO scheduler for the service:
+```bash
+python -m rpc.service --scheduler=FIFO --enforce_deadlines --scheduler_runtime=0 
+```
+
+#### To instantiate DSched scheduler for the service:
+```bash
+python -m rpc.service --scheduler=TetriSched --enforce_deadlines --scheduler_runtime=0 --release_taskgraphs --opt_passes=CRITICAL_PATH_PASS --opt_passes=CAPACITY_CONSTRAINT_PURGE_PASS --opt_passes=DYNAMIC_DISCRETIZATION_PASS --retract_schedules --scheduler_max_occupancy_threshold=0.999 --finer_discretization_at_prev_solution --scheduler_selective_rescheduling --scheduler_reconsideration_period=0.6  --scheduler_time_discretization=1 --scheduler_max_time_discretization=5 --finer_discretization_window=5 --scheduler_log_to_file  
+```
+
+### Run local tests for the erdos-spark service
+> NOTE: Verify that `pytest` is installed in the `<env_name>`. Else first do `pip install pytest`. Once installed, run the tests using:
+```bash
+pytest tests/test_service.py
+```
+
+## Step 4: Running ERDOS with Spark backend
+
+### Start the service
+```bash
+python -m rpc.service
+```
+Refer to the above section to instantiate different schedulers for the service. 
+
+> NOTE: Since we emulate a 20-node spark cluster on a single system, an additional flag `--override_worker_cpu_count` needs to be passed in the
+> service launch command.
+
+### Start all components of the spark cluster
+Run the following commands from the root directory of the `spark-mirror` repository.
+
+Also, verify that environment variable `SPARK_HOME` is set correctly to point to the path of `spark_mirror`
+
+* Start Spark Master
+```bash
+./sbin/start-master.sh --host <HOST_IP> --properties-file /path/to/spark_mirror/conf/<CONFIG_FILE_NAME>.conf
+```
+
+* Start Spark Worker
+```bash
+./sbin/start-worker.sh spark://<HOST_IP>:7077 --properties-file /path/to/spark_mirror/conf/<CONFIG_FILE_NAME>.conf
+```
+
+* Start Spark History Server
+```bash
+./sbin/start-history-server.sh --properties-file /path/to/spark_mirror/conf/<CONFIG_FILE_NAME>.conf
+```
+
+At this point, the spark framework should be registered with the erdos-service.
+
+### Viewing spark cluster status
+Start a ssh tunnel to the node hosting the spark cluster and access port `18080` using the command:
+```bash
+ssh -L 18080:<HOST_IP>:18080 <username>@<node_ip>
+```
+
+Once this command succeeds, you can view the History Server on your laptop's browser at URL: `localhost:18080`
+
+> NOTE: Same process needs to be repeated to view Master and Worker UIs. They run on ports `8080` and `8081` respectively.
+
+### Submitting a test spark application
+To be submitted from within the `tpch-spark` repo:
+```bash
+/path/to/spark_mirror/bin/spark-submit --deploy-mode cluster --master spark://<NODE_IP>:7077 --conf 'spark.port.maxRetries=132' --conf 'spark.eventLog.enabled=true' --conf 'spark.eventLog.dir=/path/to/event_log' --conf 'spark.sql.adaptive.enabled=false' --conf 'spark.sql.adaptive.coalescePartitions.enabled=false' --conf 'spark.sql.autoBroadcastJoinThreshold=-1' --conf 'spark.sql.shuffle.partitions=1' --conf 'spark.sql.files.minPartitionNum=1' --conf 'spark.sql.files.maxPartitionNum=1' --conf 'spark.app.deadline=120' --class 'main.scala.TpchQuery' target/scala-2.13/spark-tpc-h-queries_2.13-1.0.jar "4" "50" "50"
+```
+
+The above job submission is parameterized by `(DEADLINE, QUERY_NUM, DATASET_SIZE, MAX_CORES)`. An example input value for this tuple is 
+`(120, 4, 50, 50)`.
+> Refer to `launch_expt_script.py` in `tpch-spark` for more details on eligible values for these parameters and how they are used.
+
+> NOTE: By default, env variable `TPCH_INPUT_DATA_DIR` will look for `dbgen` inside the current working directory. While it works for `spark-submit`
+> issued from inside the `tpch-spark` repository, it needs to be explicitly set otherwise. 
+
+Once submitted, review the application's runtime status on the Spark Web UI.
+
+### Shutdown cluster
+* To stop the master and worker(s) after the experiment concludes, run:
+```bash
+./sbin/stop-all.sh
+```
+
+> NOTE: This command does not terminate the History Server process.
\ No newline at end of file
diff --git a/rpc/tpch_utils.py b/rpc/tpch_utils.py
index ebc4e3cd..48b28f83 100644
--- a/rpc/tpch_utils.py
+++ b/rpc/tpch_utils.py
@@ -2,106 +2,45 @@
 
 import ast
 import json
+import yaml
 import os
 from typing import Mapping, Sequence
 
 import networkx as nx
 import numpy as np
 
-HOME_TPCH_DIR = "../profiles/workload/tpch_decima/"
-TPCH_SUBDIR = "2g/"
+from data.tpch_loader import get_all_stage_info_for_query
 
 
-class SetWithCount(object):
-    """
-    allow duplication in set
-    """
-
-    def __init__(self):
-        self.set = {}
-
-    def __contains__(self, item):
-        return item in self.set
-
-    def add(self, item):
-        if item in self.set:
-            self.set[item] += 1
-        else:
-            self.set[item] = 1
-
-    def clear(self):
-        self.set.clear()
-
-    def remove(self, item):
-        self.set[item] -= 1
-        if self.set[item] == 0:
-            del self.set[item]
-
-
-def pre_process_task_duration(task_duration):
-    # remove fresh durations from first wave
-    clean_first_wave = {}
-    for e in task_duration["first_wave"]:
-        clean_first_wave[e] = []
-        fresh_durations = SetWithCount()
-        # O(1) access
-        for d in task_duration["fresh_durations"][e]:
-            fresh_durations.add(d)
-        for d in task_duration["first_wave"][e]:
-            if d not in fresh_durations:
-                clean_first_wave[e].append(d)
-            else:
-                # prevent duplicated fresh duration blocking first wave
-                fresh_durations.remove(d)
-
-
-def get_all_stage_info_for_query(query_num):
-    task_durations = np.load(
-        os.path.join(
-            HOME_TPCH_DIR, TPCH_SUBDIR, "task_duration_" + str(query_num) + ".npy"
-        ),
-        allow_pickle=True,
-    ).item()
-
-    num_nodes = len(task_durations)
-
-    stage_info = {}
-
-    for n in range(num_nodes):
-        task_duration = task_durations[n]
-        e = next(iter(task_duration["first_wave"]))
-        # NOTE: somehow only picks the first element {2: [n_tasks_in_ms]}
-
-        num_tasks = len(task_duration["first_wave"][e]) + len(
-            task_duration["rest_wave"][e]
-        )
-
-        # remove fresh duration from first wave duration
-        # drag nearest neighbor first wave duration to empty spots
-        pre_process_task_duration(task_duration)
-        rough_duration = np.mean(
-            [i for t in task_duration["first_wave"].values() for i in t]
-            + [i for t in task_duration["rest_wave"].values() for i in t]
-            + [i for t in task_duration["fresh_durations"].values() for i in t]
-        )
-
-        curr_stage = {
-            "stage_id": n,
-            "num_tasks": num_tasks,
-            "avg_task_duration": round(rough_duration),
-        }
-        stage_info[n] = curr_stage
-
-    return stage_info
+TPCH_PARENT_DIR = "/home/dgarg39/erdos-scheduling-simulator/profiles/workload/tpch/"
 
 
 def get_base_tpch_graph_structure(query_num):
-    # use query_num to read string from file
-    with open(os.path.join(HOME_TPCH_DIR, "query_dag.json")) as f:
-        tpch_query_json = json.load(f)
+    with open(os.path.join(TPCH_PARENT_DIR, "queries.yaml")) as f:
+        tpch_query_yaml = yaml.load(f, Loader=yaml.FullLoader)
+
+    # Extract the graph structure for the given query number
+    query_graph = None
+    for graph in tpch_query_yaml["graphs"]:
+        if graph["name"] == f"Q{query_num}":
+            query_graph = graph["graph"]
+            break
+
+    if query_graph is None:
+        raise ValueError(f"Query number {query_num} not found in the YAML file")
+
+    # Convert the graph structure to a format suitable for nx.DiGraph
+    query_dependency = []
+    for node in query_graph:
+        if "children" in node:
+            for child in node["children"]:
+                query_dependency.append((node["name"], child))
+        else:
+            # Ensure each tuple has two elements by adding a dummy node
+            query_dependency.append((node["name"], None))
 
-    # get query dependency from file
-    query_dependency = ast.literal_eval(tpch_query_json["query_number"][str(query_num)])
+    # Remove any tuples where the second element is None
+    query_dependency = [edge for edge in query_dependency if edge[1] is not None]
 
     # convert job structure into a nx graph
     base_tpch_graph = nx.DiGraph(query_dependency)
diff --git a/schedulers/tetrisched_scheduler.py b/schedulers/tetrisched_scheduler.py
index 6cbeb425..3198faed 100644
--- a/schedulers/tetrisched_scheduler.py
+++ b/schedulers/tetrisched_scheduler.py
@@ -601,11 +601,9 @@ def schedule(
         # Construct the STRL expression.
         scheduler_start_time = time.time()
         if len(tasks_to_be_scheduled) > 0 and any(
-            # If there is a Task belonging to a TaskGraph that hasn't been previously
-            # considered for scheduling and belongs to a TaskGraph that hasn't been
-            # cancelled, then we run the scheduler.
+            # If there is a Task that is not already SCHEDULED and whose TaskGraph
+            # hasn't been cancelled, then we run the scheduler.
             task.state != TaskState.SCHEDULED
-            and task.task_graph not in self._previously_considered_task_graphs
             and task.task_graph not in cancelled_task_graphs
             for task in tasks_to_be_scheduled
         ):
diff --git a/scripts/run_service_experiments.py b/scripts/run_service_experiments.py
new file mode 100644
index 00000000..5c540b68
--- /dev/null
+++ b/scripts/run_service_experiments.py
@@ -0,0 +1,258 @@
+import argparse
+import subprocess
+import time
+import traceback
+from pathlib import Path
+from dataclasses import dataclass
+from datetime import datetime
+
+
+def bang(cmd, dry_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
+    """Echo cmd; unless dry_run, start it with Popen and return the handle."""
+    argv = list(map(str, cmd))
+    print(" ".join(argv))
+    if dry_run:
+        return  # nothing is spawned, so the caller receives None
+    return subprocess.Popen(argv, stdout=stdout, stderr=stderr)
+
+
+def must(cmd, dry_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
+    """Run cmd via bang() to completion; raise Exception on a non-zero exit."""
+    p = bang(cmd, dry_run, stdout, stderr)
+    out, err = (None, None) if dry_run else p.communicate()  # drains any PIPEs
+    if not dry_run and p.returncode != 0:
+        raise Exception(f"Command failed. stdout: {out}. stderr: {err}.")
+    return p
+
+
+@dataclass
+class Service:
+    """Context manager that runs rpc.service plus a Spark master and worker."""
+    service_args: any
+    spark_mirror_path: Path
+    spark_master_ip: str
+    output_dir: Path
+    dry_run: bool
+
+    _service = None
+    _master = None
+    _worker = None
+
+    def __enter__(self):
+        log_file = self.output_dir / "service.log"
+        csv_file = self.output_dir / "service.csv"
+        # launch service; its stdout/stderr are written to files in output_dir
+        with (
+            open(self.output_dir / "service.stdout", "w") as f_out,
+            open(self.output_dir / "service.stderr", "w") as f_err,
+        ):
+            self._service = bang(
+                [
+                    *("python3", "-m", "rpc.service"),
+                    *("--log_file_name", log_file),
+                    *("--csv_file_name", csv_file),
+                    *self.service_args,
+                ],
+                self.dry_run,
+                stdout=f_out,
+                stderr=f_err,
+            )
+
+        # sleep for some time (presumably lets the service come up first)
+        if not self.dry_run:
+            time.sleep(3)
+
+        try:
+            # launch spark master and worker
+            self._master = must(
+                [
+                    f"{self.spark_mirror_path}/sbin/start-master.sh",
+                    *("--host", self.spark_master_ip),
+                    *(
+                        "--properties-file",
+                        f"{self.spark_mirror_path}/conf/spark-dg-config.conf",
+                    ),
+                ],
+                self.dry_run,
+            )
+            self._worker = must(
+                [
+                    f"{self.spark_mirror_path}/sbin/start-worker.sh",
+                    f"spark://{self.spark_master_ip}:7077",
+                    *(
+                        "--properties-file",
+                        f"{self.spark_mirror_path}/conf/spark-dg-config.conf",
+                    ),
+                ],
+                self.dry_run,
+            )
+        except Exception:
+            self.clean()
+            raise
+
+        if not self.dry_run:
+            time.sleep(5)
+        return self
+
+    def wait(self):
+        if self._service is not None:  # None on a dry run (bang() spawned nothing)
+            self._service.wait()
+
+    def clean(self):
+        if self._service:
+            self._service.terminate()
+        if self._master:
+            must([f"{self.spark_mirror_path}/sbin/stop-master.sh"], self.dry_run)
+        if self._worker:
+            must([f"{self.spark_mirror_path}/sbin/stop-worker.sh"], self.dry_run)
+
+    def __exit__(self, type, value, traceback):
+        self.clean()
+
+
+@dataclass
+class Launcher:
+    """Runs rpc.launch_tpch_queries, sending its output to files in output_dir."""
+
+    launcher_args: any
+    spark_mirror_path: Path
+    spark_master_ip: str
+    tpch_spark_path: Path
+    output_dir: Path
+    dry_run: bool
+
+    def launch(self):
+        """Run the launcher command to completion (or only print it on a dry run)."""
+        with (
+            open(self.output_dir / "launcher.stdout", "w") as f_out,
+            open(self.output_dir / "launcher.stderr", "w") as f_err,
+        ):
+            command = [
+                *("python3", "-u", "-m", "rpc.launch_tpch_queries"),
+                *self.launcher_args,
+                *("--spark-master-ip", self.spark_master_ip),
+                *("--spark-mirror-path", self.spark_mirror_path),
+                *("--tpch-spark-path", self.tpch_spark_path),
+            ]
+            # must() raises if the launcher exits with a non-zero status
+            must(command, self.dry_run, stdout=f_out, stderr=f_err)
+
+
+@dataclass
+class Experiment:
+    name: str
+    service_args: any
+    launcher_args: any
+
+    def run(self, args):
+        """Save the args, run the Launcher inside a Service, then await the service."""
+        output_dir = args.output_dir / (self.name + '-' + datetime.now().isoformat())
+        output_dir.mkdir(parents=True, exist_ok=True)  # no exists()-then-mkdir race
+        with open(output_dir / "service.args", "w") as f:
+            print(*self.service_args, sep='\n', file=f)
+        with open(output_dir / "launcher.args", "w") as f:
+            print(*self.launcher_args, sep='\n', file=f)
+
+        with Service(
+            service_args=self.service_args,
+            spark_mirror_path=args.spark_mirror_path,
+            spark_master_ip=args.spark_master_ip,
+            output_dir=output_dir,
+            dry_run=args.dry_run,
+        ) as s:
+            Launcher(
+                launcher_args=self.launcher_args,
+                spark_mirror_path=args.spark_mirror_path,
+                spark_master_ip=args.spark_master_ip,
+                tpch_spark_path=args.tpch_spark_path,
+                output_dir=output_dir,
+                dry_run=args.dry_run,
+            ).launch()
+            s.wait()
+
+
+def main():
+    """Parse the CLI flags and run every configured experiment in sequence."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Prints commands that will be executed for each experiment",
+    )
+    parser.add_argument(
+        "--spark-mirror-path",
+        type=Path,
+        required=True,
+        help="Path to spark-mirror repository",
+    )
+    parser.add_argument(
+        "--spark-master-ip",
+        type=str,
+        required=True,
+        help="IP address of node running Spark master",
+    )
+    parser.add_argument(
+        "--tpch-spark-path",
+        type=Path,
+        required=True,
+        help="Path to TPC-H Spark repository",
+    )
+    parser.add_argument("--output-dir", type=Path, default=Path("exp-output"))
+    args = parser.parse_args()
+
+    args.output_dir.mkdir(parents=True, exist_ok=True)  # idempotent, race-free
+
+    base_args = [
+        "--enforce_deadlines",
+        "--override_worker_cpu_count",
+    ]
+    variance_args = [
+        *("--min_deadline_variance", 10),
+        *("--max_deadline_variance", 25),
+    ]
+    edf_args = [  # NOTE: not referenced by any experiment listed below
+        *("--scheduler", "EDF"),
+    ]
+    dsched_args = [
+        *("--scheduler", "TetriSched"),
+        "--release_taskgraphs",
+        *("--opt_passes", "CRITICAL_PATH_PASS"),
+        *("--opt_passes", "CAPACITY_CONSTRAINT_PURGE_PASS"),
+        *("--opt_passes", "DYNAMIC_DISCRETIZATION_PASS"),
+        "--retract_schedules",
+        *("--scheduler_max_occupancy_threshold", 0.999),
+        "--finer_discretization_at_prev_solution",
+        "--scheduler_selective_rescheduling",
+        *("--scheduler_reconsideration_period", 0.6),
+        *("--scheduler_time_discretization", 1),
+        *("--scheduler_max_time_discretization", 5),
+        *("--finer_discretization_window", 5),
+        *("--scheduler_plan_ahead_no_consideration_gap", 1),
+    ]
+    experiments = [
+        Experiment(
+            name="dsched-q300-hard",
+            service_args=[
+                *base_args,
+                *dsched_args,
+                *variance_args,
+            ],
+            launcher_args=[
+                *("--num_queries", 300),
+                *("--variable_arrival_rate", 0.052),
+            ],
+        ),
+    ]
+
+    for i, experiment in enumerate(experiments):
+        try:
+            print(f"=== {experiment.name} ({i+1}/{len(experiments)}) ===")
+            experiment.run(args)
+            print("=== done ===")
+        except Exception as e:  # keep going: one failure must not stop the batch
+            print(traceback.format_exc())
+            print(f"Failed to run experiment '{experiment}'. Exception: '{e}'")
+
+
+if __name__ == "__main__":  # run main() only when executed as a script
+    main()
diff --git a/simulator.py b/simulator.py
index 1ccf06fd..a752c58f 100644
--- a/simulator.py
+++ b/simulator.py
@@ -5,7 +5,7 @@
 from enum import Enum
 from functools import total_ordering
 from operator import attrgetter, itemgetter
-from typing import Mapping, Optional, Sequence
+from typing import Mapping, Optional, Sequence, Callable, Dict, List
 
 import absl  # noqa: F401
 
@@ -43,6 +43,7 @@ class EventType(Enum):
     SCHEDULER_FINISHED = 12  # Signifies the end of the scheduler loop.
     SIMULATOR_END = 13  # Signify the end of the simulator loop.
     LOG_UTILIZATION = 14  # Ask the simulator to log worker pool utilization.
+    LOG_STATS = 15  # Log simulator statistics
 
     def __lt__(self, other) -> bool:
         # This method is used to order events in the event queue. We prioritize
@@ -223,6 +224,9 @@ def reheapify(self):
     def __len__(self) -> int:
         return len(self._event_queue)
 
+    def __str__(self) -> str:
+        return f"{self._event_queue!s}"  # string form of the underlying queue
+
 
 class Simulator(object):
     """A `Simulator` simulates the execution of the different tasks in the
@@ -251,7 +255,7 @@ def __init__(
         self,
         worker_pools: WorkerPools,
         scheduler: BaseScheduler,
-        workload_loader: BaseWorkloadLoader,
+        workload_loader: BaseWorkloadLoader = None,
         loop_timeout: EventTime = EventTime(time=sys.maxsize, unit=EventTime.Unit.US),
         scheduler_frequency: EventTime = EventTime(time=-1, unit=EventTime.Unit.US),
         _flags: Optional["absl.flags"] = None,
@@ -259,7 +263,7 @@ def __init__(
         if not isinstance(scheduler, BaseScheduler):
             raise ValueError("Scheduler must implement the BaseScheduler interface.")
 
-        if not isinstance(workload_loader, BaseWorkloadLoader):
+        if workload_loader and not isinstance(workload_loader, BaseWorkloadLoader):
             raise ValueError(
                 "WorkloadLoader must implement the BaseWorkloadLoader interface."
             )
@@ -337,10 +341,14 @@ def event_representation_filter(record):
         self.__log_utilization(self._simulator_time)
 
         # Internal data.
-        self._last_scheduler_start_time = self._simulator_time
+        self._last_scheduler_start_time = EventTime.invalid()
         self._next_scheduler_event = None
         self._last_scheduler_placements: Optional[Placements] = None
 
+        # Stores current placements for tasks of a task graph
+        # task_graph => {task_id => placement}
+        self._current_task_graph_placements: Dict[str, Dict[str, Placement]] = {}
+
         # A Cache from the TaskID to a future Placement event in the EventQueue.
         # The Simulator uses this bookkeeping to revoke / invalidate decisions made
         # by the past scheduler invocations.
@@ -372,6 +380,14 @@ def event_representation_filter(record):
         self._finished_task_graphs = 0
         self._missed_task_graph_deadlines = 0
 
+        # Is the simulator orchestrated?
+        self._orchestrated = _flags.orchestrated
+
+        # Minimum duration by which to push task placements
+        self._min_placement_push_duration = EventTime(
+            _flags.min_placement_push_duration, EventTime.Unit.US
+        )
+
         # Initialize the event queue.
         # To make the system continue working the loop, we add three events:
         # - SIMULATOR_START: A notional event start the simulator and log into the CSV.
@@ -391,16 +407,20 @@ def event_representation_filter(record):
             sim_start_event,
         )
 
+        if self._orchestrated:
+            return
+
         # Second, create the UPDATE_WORKLOAD event to retrieve the latest Workload.
-        upate_workload_event = Event(
-            event_type=EventType.UPDATE_WORKLOAD, time=self._simulator_time
-        )
-        self._event_queue.add_event(upate_workload_event)
-        self._logger.info(
-            "[%s] Added %s to the event queue.",
-            self._simulator_time.time,
-            upate_workload_event,
-        )
+        if self._workload_loader:
+            upate_workload_event = Event(
+                event_type=EventType.UPDATE_WORKLOAD, time=self._simulator_time
+            )
+            self._event_queue.add_event(upate_workload_event)
+            self._logger.info(
+                "[%s] Added %s to the event queue.",
+                self._simulator_time.time,
+                upate_workload_event,
+            )
 
         # Third, create the SCHEDULER_START event to invoke the scheduler.
         sched_start_event = Event(
@@ -465,19 +485,15 @@ def dry_run(self) -> None:
                     )
 
     def simulate(self) -> None:
-        """Run the simulator loop.
+        """Run the simulator loop to fixpoint.
 
         This loop requires the `Workload` to be populated with the `TaskGraph`s whose
         execution is to be simulated using the Scheduler.
         """
-        # Run the simulator loop.
-        while True:
-            time_until_next_event = self._event_queue.peek().time - self._simulator_time
 
-            # If there are any running tasks, step through the execution of the
-            # Simulator until the closest remaining time.
+        def f():
+            time_until_next_event = self.__time_until_next_event()
             running_tasks = self._worker_pools.get_placed_tasks()
-
             if len(running_tasks) > 0:
                 # There are running tasks, figure out the minimum remaining
                 # time across all the tasks.
@@ -496,20 +512,68 @@ def simulate(self) -> None:
                 # the next event in the queue, step all workers until the
                 # completion of that task, otherwise, handle the next event.
                 if min_task_remaining_time < time_until_next_event:
-                    self.__step(step_size=min_task_remaining_time)
+                    step_size = min_task_remaining_time
                 else:
-                    # NOTE: We step here so that all the Tasks that are going
-                    # to finish as a result of this step have their TASK_FINISHED
-                    # events processed first before any future placement occurs
-                    # that is decided prior.
-                    self.__step(step_size=time_until_next_event)
-                    if self.__handle_event(self._event_queue.next()):
-                        break
+                    step_size = time_until_next_event
             else:
-                # Step until the next event is supposed to be executed.
-                self.__step(step_size=time_until_next_event)
-                if self.__handle_event(self._event_queue.next()):
-                    break
+                step_size = time_until_next_event
+            return None if time_until_next_event.is_invalid() else step_size
+
+        self.__simulate_f(should_step=f)
+
+    def tick(self, until: EventTime) -> None:
+        """Tick the simulator until the specified time.
+
+        Events are handled only while the next queued event is at or before `until`.
+        """
+
+        def next_step():
+            delta = self.__time_until_next_event()
+            if delta.is_invalid():
+                return None  # no event is queued
+            if not ((delta + self._simulator_time) <= until):
+                return None  # the next event lies beyond `until`
+            return delta
+
+        self.__simulate_f(should_step=next_step)
+
+    def __simulate_f(self, should_step: Callable[[], Optional[EventTime]]) -> None:
+        """Steps the simulator for as long as `should_step` yields a step size.
+
+        Each iteration calls `should_step`, steps the simulator by the returned
+        amount and then, if `peek()` reports a queued event, handles that event.
+        The loop ends when `should_step` returns a falsy value (e.g. None) or when
+        handling an event returns a truthy value.
+
+        Args:
+            should_step (Callable[[], Optional[EventTime]]):
+                A zero-argument function that chooses the next step size.
+                - If it returns an EventTime value, the simulator steps by that amount.
+                - If it returns None, the simulation stops.
+        """
+        while True:
+            step_size = should_step()
+            if not step_size:
+                break
+            self.__step(step_size=step_size)
+            if self._event_queue.peek() and self.__handle_event(
+                self._event_queue.next()
+            ):
+                break
+
+    def get_current_placements_for_task_graph(
+        self, task_graph_name: str
+    ) -> List[Placement]:
+        if task_graph_name in self._current_task_graph_placements:  # tracked graph
+            return list(self._current_task_graph_placements[task_graph_name].values())
+        self._logger.warning(f"Cannot recognize task graph '{task_graph_name}'")
+        return []  # unknown task graph: warn and report no placements
+
+    def __time_until_next_event(self) -> EventTime:
+        """Time from now to the head event, or EventTime.invalid() if there is none."""
+        if not self._event_queue.peek():
+            return EventTime.invalid()
+        return self._event_queue.peek().time - self._simulator_time
 
     def __handle_scheduler_start(self, event: Event) -> None:
         """Handle the SCHEDULER_START event. The method invokes the scheduler, and adds
@@ -518,6 +582,10 @@ def __handle_scheduler_start(self, event: Event) -> None:
         Args:
             event (`Event`): The event to handle.
         """
+
+        if self._last_scheduler_start_time == event.time:
+            return
+
         # Log the required CSV information.
         currently_placed_tasks = self._worker_pools.get_placed_tasks()
         schedulable_tasks = self._workload.get_schedulable_tasks(
@@ -669,6 +737,9 @@ def __create_events_from_task_placement_skip(
                         task=cancelled_task,
                     )
                 )
+                self._current_task_graph_placements[placement.task.task_graph][
+                    placement.task.id
+                ] = placement
 
             if task_graph.is_cancelled():
                 released_tasks_from_new_task_graph = (
@@ -921,6 +992,10 @@ def count_placed_tasks(placements: Placements):
                 )
             )
 
+        # NOP if there are no previous placements
+        if self._last_scheduler_placements is None:
+            return
+
         num_placed = count_placed_tasks(self._last_scheduler_placements)
         num_unplaced = count_placed_tasks(self._last_scheduler_placements) - num_placed
         scheduler_runtime = event.time - self._last_scheduler_start_time
@@ -1027,18 +1102,19 @@ def count_placed_tasks(placements: Placements):
         # Reset the available tasks and the last task placement.
         self._last_scheduler_placements = None
 
-        # The scheduler has finished its execution, insert an event for the next
-        # invocation of the scheduler.
-        next_sched_event = self.__get_next_scheduler_event(
-            event,
-            self._scheduler_frequency,
-            self._last_scheduler_start_time,
-            self._loop_timeout,
-        )
-        self._event_queue.add_event(next_sched_event)
-        self._logger.info(
-            "[%s] Added %s to the event queue.", event.time.time, next_sched_event
-        )
+        if not self._orchestrated:
+            # The scheduler has finished its execution, insert an event for the next
+            # invocation of the scheduler.
+            next_sched_event = self.__get_next_scheduler_event(
+                event,
+                self._scheduler_frequency,
+                self._last_scheduler_start_time,
+                self._loop_timeout,
+            )
+            self._event_queue.add_event(next_sched_event)
+            self._logger.info(
+                "[%s] Added %s to the event queue.", event.time.time, next_sched_event
+            )
 
         # Now that all the tasks are placed, ask the simulator to log the resource
         # utilization and quit later, if requested.
@@ -1066,6 +1142,7 @@ def __handle_task_cancellation(self, event: Event) -> None:
             f"{event.task.timestamp},{event.task.id},{event.task.task_graph},"
             f"{event.task.slowest_execution_strategy.runtime.time}"
         )
+        self.log_stats(event.time)
 
         # If the task already had a placement, we remove the placement from our queue.
         if event.task.id in self._future_placement_events:
@@ -1149,8 +1226,13 @@ def __handle_task_finished(self, event: Event) -> None:
         task_placed_at_worker_pool = self._worker_pools.get_worker_pool(
             event.task.worker_pool_id
         )
+
         task_placed_at_worker_pool.remove_task(current_time=event.time, task=event.task)
-        event.task.finish()
+
+        # Remove the task from its task graph's current placements
+        del self._current_task_graph_placements[event.task.task_graph][event.task.id]
+
+        event.task.finish(event.time)
 
         # Log the TASK_FINISHED event into the CSV.
         self._finished_tasks += 1
@@ -1170,13 +1252,21 @@ def __handle_task_finished(self, event: Event) -> None:
                 if task_graph.deadline > event.time
                 else event.time - task_graph.deadline
             )
+
+            # Remove task graph from current task graph placements map
+            del self._current_task_graph_placements[event.task.task_graph]
+
             self._csv_logger.debug(
                 f"{event.time.time},TASK_GRAPH_FINISHED,{task_graph.name},"
                 f"{task_graph.deadline.to(EventTime.Unit.US).time},"
                 f"{tardiness.to(EventTime.Unit.US).time}"
             )
+
             if task_graph.deadline < event.time:
                 self._missed_task_graph_deadlines += 1
+
+            self.log_stats(event.time)
+
             self._logger.info(
                 "[%s] Finished the TaskGraph %s with a deadline %s at the "
                 "completion of the task %s with a tardiness of %s.",
@@ -1306,6 +1396,72 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None:
         ), "Inconsistency in future placements."
         task_graph = workload.get_task_graph(task.task_graph)
         assert task_graph is not None, "Inconsistency in Task placement and Workload."
+        
+        # Subroutine that avoids automatically re-placing a task in the next timestep
+        # when it was unable to start because either (i) a parent task has not finished
+        # or (ii) the worker was not ready. The sub-tree rooted at the task is
+        # unscheduled so that it can be placed again in the next run of the scheduler.
+        def unschedule_subtree_rooted_at_task(task):
+            # Find all dependent tasks rooted from given task to unschedule
+            def subtree_tasks_to_unschedule(task):
+                tasks_to_unschedule = [task]
+                for child_task in task_graph.get_children(task):
+                    tasks_to_unschedule.extend(subtree_tasks_to_unschedule(child_task))
+                return tasks_to_unschedule
+
+            tasks_to_unschedule = subtree_tasks_to_unschedule(task)
+            self._logger.info("[%s] Going to unschedule tasks rooted from %s. "
+                              "List of tasks that will be unscheduled are: %s",
+                              event.time.time,
+                              task,
+                              tasks_to_unschedule)
+            for unschedule_task in tasks_to_unschedule:
+                if unschedule_task.id in self._future_placement_events:
+                    future_placement_event = self._future_placement_events[
+                        unschedule_task.id
+                        ]
+                    if future_placement_event.time > event.time:
+                        # Delete future event from event_queue and from future_placement_events
+                        self._event_queue.remove_event(future_placement_event)
+                        del self._future_placement_events[unschedule_task.id]
+                        msg = (
+                            f"[{event.time.time}] Retrieved future placement event {future_placement_event} "
+                            f"for task {unschedule_task} and removed it."
+                        )
+                        self._logger.info(msg)
+                    elif future_placement_event.time == event.time:
+                        # Cannot delete from event_queue, as this event is likely being processed
+                        del self._future_placement_events[unschedule_task.id]
+                        msg = (
+                            f"[{event.time.time}] Removed future placement event {future_placement_event} "
+                            f"for task {unschedule_task} at the same time."
+                        )
+                        self._logger.info(msg)
+                    else:
+                        msg = (
+                            f"[{event.time.time}] Future placement event {future_placement_event} for task "
+                            f"{unschedule_task} is in the past."
+                        )
+                        self._logger.warning(msg)
+                
+                # Unschedule the task
+                if unschedule_task.state == TaskState.SCHEDULED:
+                    unschedule_task.unschedule(event.time)
+                    self._csv_logger.debug(
+                        f"{event.time.time},TASK_UNSCHEDULED,{unschedule_task.name},{unschedule_task.timestamp},"
+                        f"{unschedule_task.id},{unschedule_task.task_graph}"
+                    )
+                    msg = (
+                        f"[{event.time.time}] Finished unscheduling of task {unschedule_task}."
+                    )
+                    self._logger.info(msg)
+                else:
+                    msg = (
+                        f"[{event.time.time}] Task {unschedule_task} was not in SCHEDULED state and was in "
+                        f"{unschedule_task.state} state. Skip unscheduling."
+                    )
+                    self._logger.info(msg)
+        
         if not task.is_ready_to_run(task_graph):
             if task.state == TaskState.CANCELLED or task_graph.is_cancelled():
                 # The Task was cancelled. Consume the event.
@@ -1330,32 +1486,20 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None:
                 return
             else:
                 # If the Task is not ready to run and wasn't cancelled,
-                # find the next possible time to try executing the task.
-                parent_completion_time = max(
-                    parent.remaining_time for parent in task_graph.get_parents(task)
-                )
-                next_placement_time = event.time + max(
-                    parent_completion_time, EventTime(1, EventTime.Unit.US)
-                )
-                next_placement_event = Event(
-                    event_type=event.event_type,
-                    time=next_placement_time,
-                    task=event.task,
-                    placement=event.placement,
-                )
-                self._future_placement_events[task.id] = next_placement_event
-                self._event_queue.add_event(next_placement_event)
+                # unschedule the task and its subtree.                
                 self._logger.info(
-                    "[%s] The Task %s was not ready to run, and has been pushed for "
-                    "later placement at %s.",
+                    "[%s] The Task %s was not ready to run. The task along with its "
+                    "sub-tree will be unscheduled.",
                     event.time.to(EventTime.Unit.US).time,
                     task,
-                    next_placement_time,
                 )
                 self._csv_logger.debug(
                     f"{event.time.time},TASK_NOT_READY,{task.name},{task.timestamp},"
                     f"{task.id},{event.placement.worker_pool_id}"
                 )
+                
+                # Unschedule the task and its subtree rooted at this task.
+                unschedule_subtree_rooted_at_task(task)
                 return
         # Initialize the task at the given placement time, and place it on
         # the WorkerPool.
@@ -1363,6 +1507,7 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None:
         assert (
             worker_pool is not None
         ), f"No WorkerPool found with ID: {event.placement.worker_pool_id}."
+
         success = worker_pool.place_task(
             task,
             execution_strategy=event.placement.execution_strategy,
@@ -1387,27 +1532,27 @@ def __handle_task_placement(self, event: Event, workload: Workload) -> None:
                 "[%s] Placed %s on %s.", event.time.time, task, worker_pool
             )
             del self._future_placement_events[task.id]
+            self._current_task_graph_placements[task.task_graph][
+                task.id
+            ] = event.placement
         else:
-            next_placement_time = event.time + EventTime(1, EventTime.Unit.US)
-            next_placement_event = Event(
-                event_type=event.event_type,
-                time=next_placement_time,
-                task=event.task,
-                placement=event.placement,
-            )
-            self._event_queue.add_event(next_placement_event)
-            self._future_placement_events[task.id] = next_placement_event
+            # If the placement was not successful, send the sub-tree of the taskgraph 
+            # rooted at this task back to its previous state. It allows the scheduler
+            # to re-schedule in its next run.
             self._logger.warning(
-                "[%s] Task %s cannot be placed on worker %s, pushing placement to %s.",
+                "[%s] Task %s couldn't be placed on worker %s. The task along with its "
+                "sub-tree will be unscheduled.",
                 event.time.time,
                 task,
-                worker_pool,
-                next_placement_time,
+                event.placement.worker_pool_id,
             )
             self._csv_logger.debug(
                 f"{event.time.time},WORKER_NOT_READY,{task.name},{task.timestamp},"
                 f"{task.id},{event.placement.worker_pool_id}"
             )
+            
+            # Unschedule the task and its subtree rooted at this task.
+            unschedule_subtree_rooted_at_task(task)
 
     def __handle_task_migration(self, event: Event) -> None:
         """Handles the TASK_MIGRATION event. This event must be followed by a
@@ -1503,6 +1648,9 @@ def __handle_update_workload(self, event: Event) -> None:
             raise ValueError(
                 f"__handle_update_workload called with event of type {event.type}."
             )
+        if not self._workload_loader:
+            raise ValueError("UPDATE_WORKLOAD event enqueued without workload_loader")
+
         updated_workload = self._workload_loader.get_next_workload(
             current_time=self._simulator_time
         )
@@ -1525,6 +1673,16 @@ def __handle_update_workload(self, event: Event) -> None:
 
             # Release the Tasks that have become available.
             releasable_tasks = self._workload.get_releasable_tasks()
+
+            # Ignore non-source tasks; they get auto-released when their parent finishes.
+            def is_source_task(task):
+                task_graph = self._workload.get_task_graph(task.task_graph)
+                return task_graph.is_source_task(task)
+
+            releasable_tasks = [
+                task for task in releasable_tasks if is_source_task(task)
+            ]
+
             self._logger.info(
                 "[%s] The WorkloadLoader %s has %s TaskGraphs that released %s tasks.",
                 self._simulator_time.to(EventTime.Unit.US).time,
@@ -1539,19 +1697,36 @@ def __handle_update_workload(self, event: Event) -> None:
                 len(releasable_tasks),
             )
 
-            # Add the TaskGraphRelease events into the system.
+            # Add task graph entry in self._current_task_graph_placements to
+            # track its task placements
+            #
+            # In addition to newly added task graphs, self._workload also
+            # contains all previously released task graphs.
+            #
+            # So, we guard the addition of the entry on two conditions:
+            # (1) The task graph doesn't have an entry (we don't want to
+            #     nuke an existing one)
+            # (2) The task graph is not complete (we only keep the entry
+            #     alive while the task graph is running to avoid a memory
+            #     leak)
             for task_graph_name, task_graph in self._workload.task_graphs.items():
-                event = Event(
-                    event_type=EventType.TASK_GRAPH_RELEASE,
-                    time=task_graph.release_time,
-                    task_graph=task_graph_name,
-                )
-                self._event_queue.add_event(event)
-                self._logger.info(
-                    "[%s] Added %s to the event queue.",
-                    self._simulator_time.to(EventTime.Unit.US).time,
-                    event,
-                )
+                if (
+                    task_graph_name not in self._current_task_graph_placements
+                    and not task_graph.is_complete()
+                ):
+                    self._current_task_graph_placements[task_graph_name] = {}
+
+                    event = Event(
+                        event_type=EventType.TASK_GRAPH_RELEASE,
+                        time=task_graph.release_time,
+                        task_graph=task_graph_name,
+                    )
+                    self._event_queue.add_event(event)
+                    self._logger.info(
+                        "[%s] Added %s to the event queue.",
+                        self._simulator_time.to(EventTime.Unit.US).time,
+                        event,
+                    )
 
             max_release_time = self._simulator_time
             for task in releasable_tasks:
@@ -1577,7 +1752,8 @@ def __handle_update_workload(self, event: Event) -> None:
                     else self._simulator_time + self._workload_update_interval
                 ),
             )
-            self._event_queue.add_event(next_update_event)
+            # TODO(elton): Handle this properly
+            # self._event_queue.add_event(next_update_event)
             self._logger.info(
                 "[%s] Added %s to the event queue.",
                 self._simulator_time.time,
@@ -1657,17 +1833,16 @@ def __handle_event(self, event: Event) -> bool:
             self.__handle_scheduler_finish(event)
         elif event.event_type == EventType.SIMULATOR_END:
             # End of the simulator loop.
+            self.log_stats(event.time)
             self._csv_logger.debug(
-                f"{event.time.time},SIMULATOR_END,{self._finished_tasks},"
-                f"{self._cancelled_tasks},{self._missed_task_deadlines},"
-                f"{self._finished_task_graphs},"
-                f"{len(self._workload.get_cancelled_task_graphs())},"
-                f"{self._missed_task_graph_deadlines}"
+                f"{event.time.time},SIMULATOR_END",
             )
             self._logger.info("[%s] Ending the simulator loop.", event.time.time)
             return True
         elif event.event_type == EventType.LOG_UTILIZATION:
             self.__log_utilization(event.time)
+        elif event.event_type == EventType.LOG_STATS:
+            self.log_stats(event.time)
         else:
             raise ValueError(f"[{event.time}] Retrieved event of unknown type: {event}")
         return False
@@ -1680,7 +1855,9 @@ def __step(self, step_size: EventTime = EventTime(1, EventTime.Unit.US)) -> None
                 the clock (in us).
         """
         if step_size < EventTime.zero():
-            raise ValueError(f"Simulator cannot step backwards {step_size}")
+            raise ValueError(
+                f"[{self._simulator_time}] Simulator cannot step backwards {step_size}"
+            )
 
         # Step the simulator for the required steps and construct TASK_FINISHED events
         # for any tasks that were able to complete their execution.
@@ -1707,13 +1884,14 @@ def __step(self, step_size: EventTime = EventTime(1, EventTime.Unit.US)) -> None
             self._simulator_time.time,
             [event.task.unique_name for event in task_finished_events],
         )
-        for task_finished_event in task_finished_events:
-            self._event_queue.add_event(task_finished_event)
-            self._logger.info(
-                "[%s] Added %s to the event queue.",
-                self._simulator_time.time,
-                task_finished_event,
-            )
+        if not self._orchestrated:
+            for task_finished_event in task_finished_events:
+                self._event_queue.add_event(task_finished_event)
+                self._logger.info(
+                    "[%s] Added %s to the event queue.",
+                    self._simulator_time.time,
+                    task_finished_event,
+                )
 
     def __get_next_scheduler_event(
         self,
@@ -2006,9 +2184,18 @@ def __run_scheduler(self, event: Event) -> Event:
                 f"Received no Placements object from the Scheduler at {event.time}.",
             )
 
-        # Calculate the time at which the placements need to be applied.
         placement_time = event.time + placements.runtime
 
+        for placement in placements:
+            # If the placement's time is earlier than `placement_time`, update it
+            # to match `placement_time`.
+            # This scenario happens when the `scheduler_runtime` is non-zero.
+            if placement._placement_time and placement._placement_time < placement_time:
+                self._logger.warning(
+                    f"[{self._simulator_time}] Placement is in the past. Updating placement time from {placement._placement_time} to {placement_time}"
+                )
+                placement._placement_time = placement_time
+
         # Save the placements until the placement time arrives.
         self._last_scheduler_placements = placements
 
@@ -2038,3 +2225,12 @@ def __log_utilization(self, sim_time: EventTime):
                     f"{worker_pool_resources.get_allocated_quantity(resource)},"
                     f"{worker_pool_resources.get_available_quantity(resource)}"
                 )
+
+    def log_stats(self, sim_time: EventTime):
+        self._csv_logger.debug(
+            f"{sim_time.time},LOG_STATS,{self._finished_tasks},"
+            f"{self._cancelled_tasks},{self._missed_task_deadlines},"
+            f"{self._finished_task_graphs},"
+            f"{len(self._workload.get_cancelled_task_graphs())},"
+            f"{self._missed_task_graph_deadlines}"
+        )
diff --git a/tests/test_service.py b/tests/test_service.py
new file mode 100644
index 00000000..9f623445
--- /dev/null
+++ b/tests/test_service.py
@@ -0,0 +1,328 @@
+import re
+import time
+import subprocess
+
+import pytest
+import grpc
+from rpc import erdos_scheduler_pb2
+from rpc import erdos_scheduler_pb2_grpc
+
+
+@pytest.fixture(scope="module", autouse=True)
+def service():
+    process = subprocess.Popen(["python", "-m", "rpc.service", "--enforce_deadlines"])
+    channel = grpc.insecure_channel("localhost:50051")
+    try:
+        grpc.channel_ready_future(channel).result(timeout=5)
+        yield process
+    finally:
+        channel.close()
+        process.kill()
+
+
+def test_service():
+    channel = grpc.insecure_channel("localhost:50051")
+    stub = erdos_scheduler_pb2_grpc.SchedulerServiceStub(channel)
+
+    # Register a framework
+    request = erdos_scheduler_pb2.RegisterFrameworkRequest(
+        name="test_framework", uri="http://localhost/test", timestamp=1234567890
+    )
+    response = stub.RegisterFramework(request)
+    assert response.success and re.search(
+        r"Registered the framework 'test_framework' with URI http://localhost/test at UNIX time",
+        response.message,
+    )
+
+    # Register a worker
+    request = erdos_scheduler_pb2.RegisterWorkerRequest(
+        name="test_worker",
+        id="1234",
+        cores=100,
+        memory=1024,
+    )
+    response = stub.RegisterWorker(request)
+    assert response.success and re.search(
+        r"Registered worker \(id=1234, name=test_worker\)", response.message
+    )
+
+    # Try to fetch placements for a task graph that has not been registered yet.
+    # The request should fail with a "not registered or does not exist" message.
+    request = erdos_scheduler_pb2.GetPlacementsRequest(
+        timestamp=1234567890,
+        id="task-graph-0",
+    )
+    response = stub.GetPlacements(request)
+    assert not response.success and re.search(
+        r"Task graph with id \'task-graph-0\' is not registered or does not exist",
+        response.message,
+    )
+
+    # TODO: move to environment ready
+    # Register an incorrect TaskGraph
+    # request = erdos_scheduler_pb2.RegisterTaskGraphRequest(
+    #     id="task-graph",
+    #     name="TPCH Query 4 50 50",
+    #     timestamp=1234567890,
+    #     dependencies=[
+    #         {"key": {"id": 0, "name": "stage 0"}, "children_ids": [1, 2]},
+    #     ],
+    # )
+    # response = stub.RegisterTaskGraph(request)
+    # assert not response.success and re.search(
+    #     r"Failed to load TPCH query 4. Exception: Structure of dependencies provided for query number 4 does not match that of canonical dependencies",
+    #     response.message,
+    # )
+
+    # Register the first (correct) TaskGraph, it will be able to run
+    request = erdos_scheduler_pb2.RegisterTaskGraphRequest(
+        id="task-graph-0",
+        name="TPCH Query 4 50 50",
+        timestamp=1234567890,
+        dependencies=[
+            {"key": {"id": 0, "name": "stage 0"}, "children_ids": [2]},
+            {"key": {"id": 1, "name": "stage 1"}, "children_ids": [2]},
+            {"key": {"id": 2, "name": "stage 2"}, "children_ids": [3]},
+            {"key": {"id": 3, "name": "stage 3"}, "children_ids": [4]},
+            {"key": {"id": 4, "name": "stage 4"}, "children_ids": []},
+        ],
+    )
+    response = stub.RegisterTaskGraph(request)
+    assert (
+        response.success
+        and re.search(
+            r"Registered task graph 'task-graph-0' successfully",
+            response.message,
+        )
+        and response.num_executors == 10
+    )
+
+    # Introduce a 2s delay in getting the env ready
+    time.sleep(2)
+
+    # Mark the environment as ready
+    request = erdos_scheduler_pb2.RegisterEnvironmentReadyRequest(
+        id="task-graph-0",
+        num_executors=10,
+        timestamp=1234567890,
+    )
+    response = stub.RegisterEnvironmentReady(request)
+    assert response.success and re.search(
+        r"Successfully marked environment as ready for task graph 'Q4\[task-graph-0\]@1'",
+        response.message,
+    )
+
+    time.sleep(3)
+
+    # Get placements for the task
+    request = erdos_scheduler_pb2.GetPlacementsRequest(
+        timestamp=1234567890,
+        id="task-graph-0",
+    )
+    response = stub.GetPlacements(request)
+    assert response.success
+    actual_task_ids = set()
+    for placement in response.placements:
+        assert (
+            placement.worker_id == "1234" and placement.application_id == "task-graph-0"
+        )
+        actual_task_ids.add(placement.task_id)
+    assert actual_task_ids == {0, 1}
+
+    # Wait for 3 seconds and trigger notify task completion for tasks 0 and 1
+    time.sleep(3)
+
+    request = erdos_scheduler_pb2.NotifyTaskCompletionRequest(
+        application_id="task-graph-0", task_id=0, timestamp=1234567890
+    )
+    response = stub.NotifyTaskCompletion(request)
+    assert response.success
+
+    request = erdos_scheduler_pb2.NotifyTaskCompletionRequest(
+        application_id="task-graph-0", task_id=1, timestamp=1234567890
+    )
+    response = stub.NotifyTaskCompletion(request)
+    assert response.success
+
+    # Wait for 20s to allow the service to execute task completion for fastest task
+    time.sleep(20)
+
+    # Attempt to incorrectly notify task completion for task 3, which hasn't started yet
+    request = erdos_scheduler_pb2.NotifyTaskCompletionRequest(
+        application_id="task-graph-0", task_id=3, timestamp=1234567890
+    )
+    response = stub.NotifyTaskCompletion(request)
+    assert not response.success
+
+    # Wait 2s to allow the service to process the incorrect task completion
+    time.sleep(2)
+
+    # Wait for 25s to allow the service to finish execution of task 0
+    time.sleep(25)
+
+    # This will unlock task 2, which should now be returned as a placement
+    request = erdos_scheduler_pb2.GetPlacementsRequest(
+        timestamp=1234567890,
+        id="task-graph-0",
+    )
+    response = stub.GetPlacements(request)
+    assert response.success
+    actual_task_ids = set()
+    for placement in response.placements:
+        assert (
+            placement.worker_id == "1234" and placement.application_id == "task-graph-0"
+        )
+        actual_task_ids.add(placement.task_id)
+    assert actual_task_ids == {2}
+
+    # Attempt to register the second TaskGraph; it won't be able to run due to inadequate resources
+    request = erdos_scheduler_pb2.RegisterTaskGraphRequest(
+        id="task-graph-1",
+        name="TPCH Query 4 50 200",
+        timestamp=1234567890,
+        dependencies=[
+            {"key": {"id": 0, "name": "stage 0"}, "children_ids": [2]},
+            {"key": {"id": 1, "name": "stage 1"}, "children_ids": [2]},
+            {"key": {"id": 2, "name": "stage 2"}, "children_ids": [3]},
+            {"key": {"id": 3, "name": "stage 3"}, "children_ids": [4]},
+            {"key": {"id": 4, "name": "stage 4"}, "children_ids": []},
+        ],
+    )
+    response = stub.RegisterTaskGraph(request)
+    assert (
+        not response.success
+        and re.search(
+            r"The worker Pool cannot accomodate the task graph 'task-graph-1'",
+            response.message,
+        )
+        and response.num_executors == 0
+    )
+    
+    # Register the third TaskGraph, will run but will get cancelled due to deadline miss
+    request = erdos_scheduler_pb2.RegisterTaskGraphRequest(
+        id="task-graph-2",
+        name="TPCH Query 4 50 50",
+        timestamp=1234567890,
+        dependencies=[
+            {"key": {"id": 0, "name": "stage 0"}, "children_ids": [2]},
+            {"key": {"id": 1, "name": "stage 1"}, "children_ids": [2]},
+            {"key": {"id": 2, "name": "stage 2"}, "children_ids": [3]},
+            {"key": {"id": 3, "name": "stage 3"}, "children_ids": [4]},
+            {"key": {"id": 4, "name": "stage 4"}, "children_ids": []},
+        ],
+    )
+    response = stub.RegisterTaskGraph(request)
+    assert (
+        response.success
+        and re.search(
+            r"Registered task graph 'task-graph-2' successfully",
+            response.message,
+        )
+        and response.num_executors == 10
+    )
+
+    # Introduce a 2s delay in getting the env ready
+    time.sleep(2)
+
+    # Mark the environment as ready
+    request = erdos_scheduler_pb2.RegisterEnvironmentReadyRequest(
+        id="task-graph-2",
+        num_executors=10,
+        timestamp=1234567890,
+    )
+    response = stub.RegisterEnvironmentReady(request)
+    assert response.success and re.search(
+        r"Successfully marked environment as ready for task graph 'Q4\[task-graph-2\]@1'",
+        response.message,
+    )
+
+    # Wait for 10s to get the placements for the third task graph (task-graph-2)
+    time.sleep(10)
+
+    # Get placements for task-graph-2 (the third task graph); only one of its two root vertices (task 1) is expected to be placed
+    request = erdos_scheduler_pb2.GetPlacementsRequest(
+        timestamp=1234567890,
+        id="task-graph-2",
+    )
+    response = stub.GetPlacements(request)
+    assert response.success
+    actual_task_ids = set()
+    for placement in response.placements:
+        assert (
+            placement.worker_id == "1234" and placement.application_id == "task-graph-2"
+        )
+        actual_task_ids.add(placement.task_id)
+    assert actual_task_ids == {1}
+
+    # Wait for 100 more seconds and request placements again
+    time.sleep(100)
+
+    # Notify task completion for task 2 in task graph 0 to trigger scheduler run again
+    request = erdos_scheduler_pb2.NotifyTaskCompletionRequest(
+        application_id="task-graph-0", task_id=2, timestamp=1234567890
+    )
+    response = stub.NotifyTaskCompletion(request)
+    assert response.success
+
+    # Wait for 2 seconds to allow scheduler to process task completion and run scheduler
+    time.sleep(2)
+
+    # Get placements for the task graph; the entire task graph will be cancelled since its
+    # deadline has passed. Since one root vertex (1) is running, the other root vertex (0) will
+    # be cancelled first, then the subsequent vertices.
+    # NOTE: The service will wait until all running/scheduled tasks complete and are removed
+    # from the worker pool before issuing terminate=True for the task graph. Until then it will
+    # return the current placements for the task graph (including those already sent by the
+    # service) and wait for running tasks to finish. Spark will ignore them.
+    request = erdos_scheduler_pb2.GetPlacementsRequest(
+        timestamp=1234567890,
+        id="task-graph-2",
+    )
+    response = stub.GetPlacements(request)
+    assert response.success
+    actual_task_ids = set()
+    # Will return placement for task_id 1
+    for placement in response.placements:
+        if placement.task_id == 1:
+            assert (
+                placement.worker_id == "1234" 
+                and placement.application_id == "task-graph-2" 
+            )
+        actual_task_ids.add(placement.task_id)
+    assert actual_task_ids == {1}
+    
+    # Wait for 5s to issue notify task completion for task_id 1 in task-graph-2
+    time.sleep(5)
+    request = erdos_scheduler_pb2.NotifyTaskCompletionRequest(
+        application_id="task-graph-2", task_id=1, timestamp=1234567890
+    )
+    response = stub.NotifyTaskCompletion(request)
+    assert response.success
+    
+    # Wait for 5s to allow the simulator to process the event. 
+    # Invoke get placements again for task-graph 2, it should return terminate=True now
+    time.sleep(5)
+    request = erdos_scheduler_pb2.GetPlacementsRequest(
+        timestamp=1234567890,
+        id="task-graph-2",
+    )
+    response = stub.GetPlacements(request)
+    assert response.success
+    actual_task_ids = set()
+    # No placements are expected now; the response should only signal terminate=True
+    for placement in response.placements:
+        actual_task_ids.add(placement.task_id)
+    assert len(actual_task_ids) == 0
+    assert response.terminate == True
+
+    # Deregister framework
+    request = erdos_scheduler_pb2.DeregisterFrameworkRequest(
+        name="test_framework", uri="http://localhost/test", timestamp=1234567890
+    )
+    response = stub.DeregisterFramework(request)
+    assert response.success and re.search(
+        r"Successfully deregistered the framework at http://localhost/test",
+        response.message,
+    )
+
+    channel.close()
diff --git a/utils.py b/utils.py
index 7e8f2814..1fa8adb0 100644
--- a/utils.py
+++ b/utils.py
@@ -93,24 +93,28 @@ def to_unchecked(self, unit: Unit) -> Tuple[float, Unit]:
         return self.time * self.unit.to(unit), unit
 
     def fuzz(
-        self, variance: Tuple[int, int], bounds: Tuple[int, int] = (0, sys.maxsize)
+            self, variance: Tuple[int, int], bounds: Tuple[int, int] = (0, sys.maxsize), rng: random.Random = None
     ) -> "EventTime":
         """Fuzz the time according to the provided `variance` and within the bounds.
 
         Args:
             variance (`Tuple[int, int]`): The (minimum, maximum) % variance to fuzz by.
             bounds (`Tuple[int, int]`): The (minimum, maximum) bounds to fuzz within.
+            rng (`random.Random`): The random number generator to use. Defaults to the class-level RNG when `None`.
 
         Returns:
             The fuzzed time according to the given variance.
         """
+        if rng is None:
+            rng = type(self)._rng
+
         min_variance, max_variance = variance
         min_bound, max_bound = bounds
         fuzzed_time = max(
             min_bound,
             min(
                 max_bound,
-                type(self)._rng.uniform(
+                rng.uniform(
                     self.time * abs(min_variance) / 100.0,
                     self.time * abs(max_variance) / 100.0,
                 ),
diff --git a/workload/jobs.py b/workload/jobs.py
index e0acb2e4..97b4966a 100644
--- a/workload/jobs.py
+++ b/workload/jobs.py
@@ -805,12 +805,23 @@ def _generate_task_graph(
             resolve_conditionals = False
             task_logger = setup_logging(name="Task")
 
+        # Create an RNG to be used when fuzzing deadlines, seeded by
+        # the TaskGraph name and the global random seed, if provided.
+        # This ensures that deadlines are deterministic, which is
+        # needed for simulator/Spark parity.
+        deadline_rng = random.Random(
+            (str(_flags.random_seed) if _flags else "") + task_graph_name
+        )
+
         # Generate the deadline for all the Tasks.
         # TODO (Sukrit): Right now, this assumes that all Tasks in the TaskGraph come
         # with the same deadline. At some point, we will have to implement a
         # heuristic-based deadline splitting technique.
+        
+        # NOTE: The taskgraph deadline is re-generated (and overwritten) after 
+        # use_branch_predicated_deadlines code, since fuzz is invoked again there.
         task_deadline = release_time + self.completion_time.fuzz(
-            deadline_variance, deadline_bounds
+            deadline_variance, deadline_bounds, rng=deadline_rng
         )
 
         # Generate all the `Task`s from the `Job`s in the graph.
@@ -883,8 +894,10 @@ def _generate_task_graph(
         else:
             weighted_task_graph_length = self.__get_completion_time()
 
+        # NOTE: This is the second time the deadline is being set, based on a second 
+        # invocation of fuzz.
         task_graph_deadline = release_time + weighted_task_graph_length.fuzz(
-            deadline_variance, deadline_bounds
+            deadline_variance, deadline_bounds, rng=deadline_rng
         )
         if _flags and _flags.decompose_deadlines:
             stages_info = {}
diff --git a/workload/tasks.py b/workload/tasks.py
index 48691ae7..c929f696 100644
--- a/workload/tasks.py
+++ b/workload/tasks.py
@@ -53,7 +53,7 @@ class Task(object):
     Args:
         name (`str`): The name of the computation (typically the callback of
             the ERDOS operator.
-        task_graph_name (`str`): The name of the TaskGraph that this Task belongs to.
+        task_graph (`str`): The name of the TaskGraph that this Task belongs to.
         job (`Job`): The job that created this particular task.
         deadline (`EventTime`): The absolute deadline by which the task should complete.
         profile (`WorkProfile`): A profile of the computation that the Task is supposed
@@ -224,6 +224,7 @@ def schedule(
         self._state = TaskState.SCHEDULED
         self._scheduling_time = time
         self._scheduler_placement = placement
+        self._start_time = placement.placement_time
         self._worker_pool_id = placement.worker_pool_id
         self.update_remaining_time(placement.execution_strategy.runtime)