|
| 1 | +# Managing temporary resources |
| 2 | + |
| 3 | +Sometimes you have workloads that depend on resources that need to be set up,
| 4 | +used, then cleaned up. Examples might include: |
| 5 | + |
| 6 | +- Compute resources like Dask or Spark clusters |
| 7 | +- Cloud resources like Virtual machines or Kubernetes deployments |
| 8 | +- Docker containers |
| 9 | +- ... |
| 10 | + |
| 11 | +To simplify this pattern, Prefect provides a |
| 12 | +[`ResourceManager`](/core/task_library/resource_manager.html) object for |
| 13 | +encapsulating the setup and cleanup tasks for resources. The most common way |
| 14 | +to define a `ResourceManager` is using the `resource_manager` decorator. This |
| 15 | +decorator wraps a class with the following methods: |
| 16 | + |
| 17 | +- `__init__`: Initializes the resource manager with whatever arguments are needed. |
| 18 | +- `setup`: Creates the resource. Takes no arguments, and may optionally |
| 19 | + return a value that can be used by downstream tasks. |
| 20 | +- `cleanup`: Cleans up the resource. Takes the result of `setup` as an argument. |
| 21 | + |
| 22 | +```python |
| 23 | +from prefect import resource_manager |
| 24 | + |
| 25 | +@resource_manager |
| 26 | +class MyResource: |
| 27 | + def __init__(self, ...): |
| 28 | + """Initialize the resource manager. |
| 29 | +
|
| 30 | + This should store any values required by the setup and cleanup steps. |
| 31 | + """ |
| 32 | + ... |
| 33 | + |
| 34 | + def setup(self): |
| 35 | + """Setup the resource. |
| 36 | +
|
| 37 | + The result of this method can be used in downstream tasks. |
| 38 | + """ |
| 39 | + ... |
| 40 | + |
| 41 | + def cleanup(self, resource): |
| 42 | + """Cleanup the resource. |
| 43 | +
|
| 44 | + This receives the result of `setup`, and is always called if `setup` |
| 45 | + succeeds, even if other upstream tasks failed. |
| 46 | + """ |
| 47 | + ... |
| 48 | +``` |
| 49 | + |
| 50 | +The resulting `ResourceManager` can then be used when building a `Flow` as a |
| 51 | +context-manager around tasks that rely on that resource. The resource will be |
| 52 | +created upon entering the context block, and will be cleaned up upon exiting |
| 53 | +the block, even if tasks contained inside the context fail. |
| 54 | + |
| 55 | +```python |
| 56 | +with Flow("example") as flow: |
| 57 | + with MyResource(...) as resource: |
| 58 | + some_task(resource) |
| 59 | + other_task(resource) |
| 60 | +``` |
| 61 | + |
| 62 | +`ResourceManager` objects are intended for defining resources where the cleanup |
| 63 | +of the resource should also be monitored and managed by Prefect as a `Task` in |
| 64 | +your `Flow`. This is good for things that can be expensive, like cloud |
| 65 | +resources. For things where a failure to clean up an object isn't detrimental
| 66 | +(e.g. a `boto` client) you may be better off relying on other patterns.
| 67 | + |
| 68 | +## Example: Creating a temporary Dask Cluster |
| 69 | + |
| 70 | +Here we provide a full example for using a `ResourceManager` to setup and |
| 71 | +cleanup a temporary [Dask](https://dask.org) cluster. |
| 72 | + |
| 73 | +:::: tabs |
| 74 | +::: tab "Functional API" |
| 75 | +```python |
| 76 | +from prefect import Flow, task, resource_manager, Parameter |
| 77 | +import dask |
| 78 | +from dask.distributed import Client |
| 79 | + |
| 80 | +@resource_manager |
| 81 | +class DaskCluster: |
| 82 | + """Create a temporary dask cluster. |
| 83 | +
|
| 84 | + Args: |
| 85 | + - n_workers (int, optional): The number of workers to start. |
| 86 | + """ |
| 87 | + def __init__(self, n_workers=None): |
| 88 | + self.n_workers = n_workers |
| 89 | + |
| 90 | + def setup(self): |
| 91 | + """Create a temporary dask cluster, returning the `Client`""" |
| 92 | + return Client(n_workers=self.n_workers) |
| 93 | + |
| 94 | + def cleanup(self, client): |
| 95 | + """Shutdown the temporary dask cluster""" |
| 96 | + client.close() |
| 97 | + |
| 98 | +@task |
| 99 | +def load_data(): |
| 100 | + """Load some data""" |
| 101 | + return dask.datasets.timeseries() |
| 102 | + |
| 103 | +@task |
| 104 | +def summarize(df, client): |
| 105 | + """Compute a summary on the data""" |
| 106 | + return df.describe().compute() |
| 107 | + |
| 108 | +@task |
| 109 | +def write_csv(df, path): |
| 110 | + """Write the summary to disk as a csv""" |
| 111 | + return df.to_csv(path, index_label="index") |
| 112 | + |
| 113 | + |
| 114 | +with Flow("dask-example") as flow: |
| 115 | + n_workers = Parameter("n_workers", default=None) |
| 116 | + out_path = Parameter("out_path", default="summary.csv") |
| 117 | + |
| 118 | + with DaskCluster(n_workers=n_workers) as client: |
| 119 | + # These tasks rely on a dask cluster to run, so we create them inside |
| 120 | + # the `DaskCluster` resource manager |
| 121 | + df = load_data() |
| 122 | + summary = summarize(df, client) |
| 123 | + |
| 124 | + # This task doesn't rely on the dask cluster to run, so it doesn't need to |
| 125 | + # be under the `DaskCluster` context |
| 126 | + write_csv(summary, out_path) |
| 127 | +``` |
| 128 | +::: |
| 129 | + |
| 130 | +::: tab "Imperative API" |
| 131 | +```python |
| 132 | +from prefect import Flow, Task, resource_manager, Parameter |
| 133 | +import dask |
| 134 | +from dask.distributed import Client |
| 135 | + |
| 136 | +@resource_manager |
| 137 | +class DaskCluster: |
| 138 | + """Create a temporary dask cluster. |
| 139 | +
|
| 140 | + Args: |
| 141 | + - n_workers (int, optional): The number of workers to start. |
| 142 | + """ |
| 143 | + def __init__(self, n_workers=None): |
| 144 | + self.n_workers = n_workers |
| 145 | + |
| 146 | + def setup(self): |
| 147 | + """Create a temporary dask cluster, returning the `Client`""" |
| 148 | + return Client(n_workers=self.n_workers) |
| 149 | + |
| 150 | + def cleanup(self, client): |
| 151 | + """Shutdown the temporary dask cluster""" |
| 152 | + client.close() |
| 153 | + |
| 154 | +class LoadData(Task): |
| 155 | + """Load some data""" |
| 156 | + def run(self): |
| 157 | + return dask.datasets.timeseries() |
| 158 | + |
| 159 | +class Summarize(Task): |
| 160 | + """Compute a summary on the data""" |
| 161 | + def run(self, df, client): |
| 162 | + return df.describe().compute() |
| 163 | + |
| 164 | +class WriteCSV(Task): |
| 165 | + """Write the summary to disk as a csv""" |
| 166 | + def run(self, df, path): |
| 167 | + return df.to_csv(path, index_label="index") |
| 168 | + |
| 169 | + |
| 170 | +flow = Flow("dask-example") |
| 171 | +n_workers = Parameter("n_workers", default=None) |
| 172 | +out_path = Parameter("out_path", default="summary.csv") |
| 173 | + |
| 174 | +with DaskCluster(n_workers=n_workers, flow=flow) as client: |
| 175 | + # These tasks rely on a dask cluster to run, so we create them inside |
| 176 | + # the `DaskCluster` resource manager |
| 177 | + df = LoadData() |
| 178 | + summary = Summarize().bind(df, client, flow=flow) |
| 179 | + |
| 180 | +# This task doesn't rely on the dask cluster to run, so it doesn't need to |
| 181 | +# be under the `DaskCluster` context |
| 182 | +WriteCSV().bind(summary, out_path, flow=flow) |
| 183 | +``` |
| 184 | +::: |
| 185 | +:::: |
0 commit comments