|
5 | 5 | "cell_type": "markdown", |
6 | 6 | "metadata": {}, |
7 | 7 | "source": [ |
8 | | - "TODO: Advanced Dask parallelization for HPC" |
| 8 | + "# Parallelization\n", |
| 9 | + "\n", |
| 10 | + "TPOT2 uses the Dask package for parallelization, either locally (dask.distributed.LocalCluster) or across multiple nodes via a job scheduler (dask-jobqueue).\n", |
| 11 | + "\n", |
| 12 | + "## Local Machine Parallelization\n", |
| 13 | + "\n", |
| 14 | + "TPOT2 can be easily parallelized on a local machine by setting the `n_jobs` and `memory_limit` parameters.\n", |
| 15 | + "\n", |
| 16 | + "`n_jobs` dictates how many Dask workers to launch. In TPOT2 this corresponds to the number of pipelines evaluated in parallel.\n", |
| 17 | + "\n", |
| 18 | + "`memory_limit` is the amount of RAM each worker may use." |
| 19 | + ] |
| 20 | + }, |
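The `memory_limit` strings used below (for example `"4GB"`) are parsed by Dask into byte counts. A minimal sketch of that parsing, assuming only that Dask is installed:

```python
from dask.utils import parse_bytes

# Dask parses human-readable memory strings into byte counts;
# decimal units (GB) and binary units (GiB) differ.
print(parse_bytes("4GB"))   # 4000000000
print(parse_bytes("4GiB"))  # 4294967296
```

Any string accepted by `parse_bytes` can be used for `memory_limit`.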
| 21 | + { |
| 22 | + "cell_type": "code", |
| 23 | + "execution_count": null, |
| 24 | + "metadata": {}, |
| 25 | + "outputs": [], |
| 26 | + "source": [ |
| 27 | + "import tpot2\n", |
| 28 | + "import sklearn\n", |
| 29 | + "import sklearn.datasets\n", |
| | + "import sklearn.metrics\n", |
| | + "import sklearn.model_selection\n", |
| 30 | + "import numpy as np\n", |
| 31 | + "scorer = sklearn.metrics.get_scorer('roc_auc_ovr')\n", |
| 32 | + "X, y = sklearn.datasets.load_digits(return_X_y=True)\n", |
| 33 | + "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n", |
| 34 | + "\n", |
| 35 | + "\n", |
| 36 | + "est = tpot2.TPOTClassifier(population_size=8, generations=5, n_jobs=4, memory_limit=\"4GB\", verbose=1)\n", |
| 37 | + "est.fit(X_train, y_train)\n", |
| 38 | + "print(scorer(est, X_test, y_test))" |
9 | 39 | ] |
10 | 40 | }, |
11 | 41 | { |
12 | 42 | "attachments": {}, |
13 | 43 | "cell_type": "markdown", |
14 | 44 | "metadata": {}, |
15 | 45 | "source": [ |
16 | | - "Dask Dashboard\n", |
| 46 | + "## Manual Dask Clients and Dashboard\n", |
| 47 | + "\n", |
| 48 | + "You can also manually initialize a Dask client. This gives you additional control over the parallelization, helps with debugging, and lets you view a dashboard of TPOT2's live performance.\n", |
| 49 | + "\n", |
| 50 | + "You can find more details in the official [Dask documentation](https://docs.dask.org/en/stable/).\n", |
| 51 | + "\n", |
| 52 | + "\n", |
| 53 | + "- [Dask Python Tutorial](https://docs.dask.org/en/stable/deploying-python.html)\n", |
| 54 | + "- [Dask Dashboard](https://docs.dask.org/en/stable/dashboard.html)" |
| 55 | + ] |
| 56 | + }, |
| 57 | + { |
| 58 | + "attachments": {}, |
| 59 | + "cell_type": "markdown", |
| 60 | + "metadata": {}, |
| 61 | + "source": [ |
| 62 | + "Initializing a basic Dask local cluster:" |
| 63 | + ] |
| 64 | + }, |
| 65 | + { |
| 66 | + "cell_type": "code", |
| 67 | + "execution_count": null, |
| 68 | + "metadata": {}, |
| 69 | + "outputs": [], |
| 70 | + "source": [ |
| 71 | + "from dask.distributed import Client, LocalCluster\n", |
| 72 | + "\n", |
| 73 | + "n_jobs = 4\n", |
| 74 | + "memory_limit = \"4GB\"\n", |
17 | 75 | "\n", |
18 | | - "https://docs.dask.org/en/stable/dashboard.html" |
| 76 | + "cluster = LocalCluster(n_workers=n_jobs, # one worker per parallel pipeline evaluation\n", |
| 77 | + " threads_per_worker=1,\n", |
| 78 | + " memory_limit=memory_limit)\n", |
| 79 | + "client = Client(cluster)" |
19 | 80 | ] |
20 | 81 | }, |
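Before handing a cluster to TPOT2, you can confirm it is doing work by submitting a few toy tasks yourself. This sketch uses an in-process cluster (`processes=False`) so it runs anywhere; TPOT2 uses the same mechanism, one Dask task per candidate pipeline:

```python
from dask.distributed import Client, LocalCluster

# In-process cluster (processes=False) avoids spawning worker processes,
# which keeps this sketch lightweight and portable.
with LocalCluster(n_workers=2, threads_per_worker=1, processes=False) as cluster, \
        Client(cluster) as client:
    # Each task here stands in for one pipeline evaluation.
    futures = client.map(lambda x: x * x, range(4))
    print(client.gather(futures))  # [0, 1, 4, 9]
```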
21 | 82 | { |
22 | 83 | "attachments": {}, |
23 | 84 | "cell_type": "markdown", |
24 | 85 | "metadata": {}, |
25 | 86 | "source": [ |
26 | | - "Click the link to get to a live dashboard" |
| 87 | + "Get the link to view the Dask dashboard:" |
27 | 88 | ] |
28 | 89 | }, |
29 | 90 | { |
|
32 | 93 | "metadata": {}, |
33 | 94 | "outputs": [], |
34 | 95 | "source": [ |
35 | | - "#TODO\n", |
36 | | - "from dask.distributed import Client\n", |
37 | | - "client = Client() # start distributed scheduler locally.\n", |
38 | | - "client" |
| 96 | + "client.dashboard_link" |
39 | 97 | ] |
40 | 98 | }, |
41 | 99 | { |
42 | 100 | "attachments": {}, |
43 | 101 | "cell_type": "markdown", |
44 | 102 | "metadata": {}, |
45 | 103 | "source": [ |
46 | | - "Dask single node" |
| 104 | + "Pass the client into TPOT to train.\n", |
| 105 | + "Note that if a client is passed in manually, TPOT will ignore `n_jobs` and `memory_limit`.\n", |
| 106 | + "If no client is passed in, TPOT will ignore any global/existing client and create its own." |
47 | 107 | ] |
48 | 108 | }, |
49 | 109 | { |
|
52 | 112 | "metadata": {}, |
53 | 113 | "outputs": [], |
54 | 114 | "source": [ |
55 | | - "#TODO" |
| 115 | + "est = tpot2.TPOTClassifier(population_size=8, generations=5, client=client, verbose=1)\n", |
| 116 | + "# this is equivalent to:\n", |
| 117 | + "# est = tpot2.TPOTClassifier(population_size=8, generations=5, n_jobs=4, memory_limit=\"4GB\", verbose=1)\n", |
| 118 | + "est.fit(X_train, y_train)\n", |
| 119 | + "print(scorer(est, X_test, y_test))\n", |
| 120 | + "\n", |
| 121 | + "# It is good practice to close the client and cluster when you are done with them\n", |
| 122 | + "client.close()\n", |
| 123 | + "cluster.close()" |
56 | 124 | ] |
57 | 125 | }, |
58 | 126 | { |
59 | 127 | "attachments": {}, |
60 | 128 | "cell_type": "markdown", |
61 | 129 | "metadata": {}, |
62 | 130 | "source": [ |
63 | | - "Dask multiple nodes" |
| 131 | + "Alternatively, you can initialize the cluster and client with a context manager that will automatically close them." |
64 | 134 | ] |
65 | 135 | }, |
66 | 136 | { |
|
69 | 139 | "metadata": {}, |
70 | 140 | "outputs": [], |
71 | 141 | "source": [ |
72 | | - "#TODO" |
| 142 | + "from dask.distributed import Client, LocalCluster\n", |
| 143 | + "import tpot2\n", |
| 144 | + "import sklearn\n", |
| 145 | + "import sklearn.datasets\n", |
| | + "import sklearn.metrics\n", |
| | + "import sklearn.model_selection\n", |
| 146 | + "import numpy as np\n", |
| 147 | + "\n", |
| 148 | + "scorer = sklearn.metrics.get_scorer('roc_auc_ovr')\n", |
| 149 | + "X, y = sklearn.datasets.load_digits(return_X_y=True)\n", |
| 150 | + "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n", |
| 151 | + "\n", |
| 152 | + "\n", |
| 153 | + "n_jobs = 4\n", |
| 154 | + "memory_limit = \"4GB\"\n", |
| 155 | + "\n", |
| 156 | + "with LocalCluster(\n", |
| 157 | + "    n_workers=n_jobs,\n", |
| 158 | + "    threads_per_worker=1,\n", |
| 159 | + "    memory_limit=memory_limit,\n", |
| 160 | + ") as cluster, Client(cluster) as client:\n", |
| 161 | + "    est = tpot2.TPOTClassifier(population_size=8, generations=5, client=client, verbose=1)\n", |
| 162 | + "    est.fit(X_train, y_train)\n", |
| 163 | + "    print(scorer(est, X_test, y_test))" |
| 164 | + ] |
| 165 | + }, |
| 166 | + { |
| 167 | + "attachments": {}, |
| 168 | + "cell_type": "markdown", |
| 169 | + "metadata": {}, |
| 170 | + "source": [ |
| 171 | + "## Multi-Node Parallelization\n", |
| 172 | + "\n", |
| 173 | + "Dask can parallelize across multiple nodes via job queueing systems, using the dask-jobqueue package. More information can be found in the official [dask-jobqueue documentation](https://jobqueue.dask.org/en/latest/).\n", |
| 174 | + "\n", |
| 175 | + "To parallelize TPOT2 with dask-jobqueue, pass a client backed by a jobqueue cluster (with the desired settings) into the `client` parameter. Each job will evaluate a single pipeline.\n", |
| 176 | + "\n", |
| 177 | + "Note that TPOT will ignore `n_jobs` and `memory_limit`, as these should be set inside the Dask cluster." |
| 178 | + ] |
| 179 | + }, |
| 180 | + { |
| 181 | + "cell_type": "code", |
| 182 | + "execution_count": null, |
| 183 | + "metadata": {}, |
| 184 | + "outputs": [], |
| 185 | + "source": [ |
| 186 | + "from dask.distributed import Client\n", |
| 187 | + "import sklearn\n", |
| 188 | + "import sklearn.datasets\n", |
| 189 | + "import sklearn.metrics\n", |
| 190 | + "import sklearn.model_selection\n", |
| 191 | + "import tpot2\n", |
| 192 | + "\n", |
| 193 | + "from dask_jobqueue import SGECluster # or SLURMCluster, PBSCluster, etc. Replace SGE with your scheduler.\n", |
| 194 | + "cluster = SGECluster(\n", |
| 195 | + "    queue='all.q',\n", |
| 196 | + "    cores=2,\n", |
| 197 | + "    memory='50 GB'\n", |
| 198 | + ")\n", |
| 200 | + "\n", |
| 201 | + "cluster.adapt(minimum_jobs=10, maximum_jobs=100) # auto-scale between 10 and 100 jobs\n", |
| 202 | + "\n", |
| 203 | + "client = Client(cluster)\n", |
| 204 | + "\n", |
| 205 | + "scorer = sklearn.metrics.get_scorer('roc_auc_ovr')\n", |
| 206 | + "X, y = sklearn.datasets.load_digits(return_X_y=True)\n", |
| 207 | + "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)\n", |
| 208 | + "\n", |
| 209 | + "est = tpot2.TPOTClassifier(population_size=100, generations=5, client=client, verbose=1)\n", |
| 212 | + "est.fit(X_train, y_train)\n", |
| 213 | + "print(scorer(est, X_test, y_test))\n", |
| 214 | + "\n", |
| 215 | + "# It is good practice to close the client and cluster when you are done with them\n", |
| 216 | + "client.close()\n", |
| 217 | + "cluster.close()" |
73 | 218 | ] |
74 | 219 | } |
75 | 220 | ], |
|
89 | 234 | "name": "python", |
90 | 235 | "nbconvert_exporter": "python", |
91 | 236 | "pygments_lexer": "ipython3", |
92 | | - "version": "3.10.9" |
| 237 | + "version": "3.10.10" |
93 | 238 | }, |
94 | 239 | "orig_nbformat": 4, |
95 | 240 | "vscode": { |
|