man-group · andreytaboola · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -220,7 +220,7 @@ jobs:
       PYTHON_VERSION: "3_6"
       CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_6
       CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_6
-      VERSION: 0.6.3
+      VERSION: 0.6.4
       PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
       YARN_STATIC_DIR: notebooker/web/static/
       IMAGE_NAME: mangroup/notebooker
@@ -236,7 +236,7 @@ jobs:
     environment:
       CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_7
       CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_7
-      VERSION: 0.6.3
+      VERSION: 0.6.4
       PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
       YARN_STATIC_DIR: notebooker/web/static/
       IMAGE_NAME: mangroup/notebooker
@@ -250,7 +250,7 @@ jobs:
     environment:
       CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_8
       CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_8
-      VERSION: 0.6.3
+      VERSION: 0.6.4
       PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
       YARN_STATIC_DIR: notebooker/web/static/
       IMAGE_NAME: mangroup/notebooker
@@ -264,7 +264,7 @@ jobs:
     environment:
       CIRCLE_ARTIFACTS: /tmp/circleci-artifacts/3_11
       CIRCLE_TEST_REPORTS: /tmp/circleci-test-results/3_11
-      VERSION: 0.6.3
+      VERSION: 0.6.4
       PANDOC_RELEASES_URL: https://github.com/jgm/pandoc/releases
       YARN_STATIC_DIR: notebooker/web/static/
       IMAGE_NAME: mangroup/notebooker

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,16 @@
+0.6.4 (2024-10-08)
+------------------
+* Feature: Categorization of notebooks, allowing a special category tag to be set on notebooks for easy grouping
+* This feature solves issues with huge repos where only a limited number of notebooks are used in the webapp:
+    * Very deep navigation tree in the ui for deepest notebook paths
+    * Super long names of the reports in scheduler and results pages
+    * Hard tile navigation for the reports
+* Enabling categorization of the notebooks using special flag:
+    * Add 'category=..' tag to the relevant notebooks metadata
+    * Execute notebooker with --categorization flag
+* Important: only categorized notebooks (those having a 'category=..' tag) are shown as options to select in the webapp
+* Keeps the original navigation by directory structure if the categorization flag is not set
+
 0.6.3 (2024-07-11)
 ------------------
 * Feature: Flag to preserve original working directory when running notebooks to make local imports and relative paths work.

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -20,7 +20,7 @@ Do also make sure to run the webapp and make sure you haven't broken anything.
 When releasing a new version, please increment the version number in:
 * `notebooker/version.py`
 * `.circleci/config.yml`
-* `docs/config.yml`
+* `docs/conf.py`
 * `notebooker/web/static/package.json`
 
 This build will validate that these numbers match those given in `.circleci/config.yml`.

diff --git a/docs/conf.py b/docs/conf.py
@@ -23,7 +23,7 @@
 author = "Man Group Quant Tech"
 
 # The full version, including alpha/beta/rc tags
-release = "0.6.3"
+release = "0.6.4"
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/notebooker/_entrypoints.py b/notebooker/_entrypoints.py
@@ -70,6 +70,12 @@ def filesystem_default_value(dirname):
     is_flag=True,
     help="If selected, notebooker set current working directory to absolute path of the notebook to keep it local context available",
 )
+@click.option(
+    "--categorization",
+    default=False,
+    is_flag=True,
+    help="If selected, discovers only templates with the 'category=example' tags set to any cell and groups notebooks by their category names",
+)
 @click.option(
     "--default-mailfrom", default=DEFAULT_MAILFROM_ADDRESS, help="Set a new value for the default mailfrom setting."
 )
@@ -91,6 +97,7 @@ def base_notebooker(
     py_template_subdir,
     notebooker_disable_git,
     execute_at_origin,
+    categorization,
     default_mailfrom,
     running_timeout,
     serializer_cls,
@@ -106,6 +113,7 @@ def base_notebooker(
         PY_TEMPLATE_SUBDIR=py_template_subdir,
         NOTEBOOKER_DISABLE_GIT=notebooker_disable_git,
         EXECUTE_AT_ORIGIN=execute_at_origin,
+        CATEGORIZATION=categorization,
         DEFAULT_MAILFROM=default_mailfrom,
         RUNNING_TIMEOUT=running_timeout,
     )
@@ -180,6 +188,7 @@ def start_webapp(
 
 @base_notebooker.command()
 @click.option("--report-name", help="The name of the template to execute, relative to the template directory.")
+@click.option("--category", default="", help="Category of the template.")
 @click.option(
     "--overrides-as-json", default="{}", help="The parameters to inject into the notebook template, in JSON format."
 )
@@ -230,6 +239,7 @@ def start_webapp(
 def execute_notebook(
     config: BaseConfig,
     report_name,
+    category,
     overrides_as_json,
     iterate_override_values_of,
     report_title,
@@ -250,6 +260,7 @@ def execute_notebook(
     return execute_notebook_entrypoint(
         config,
         report_name,
+        category,
         overrides_as_json,
         iterate_override_values_of,
         report_title,

diff --git a/notebooker/constants.py b/notebooker/constants.py
@@ -88,6 +88,7 @@ class NotebookResultBase(object):
     mailfrom = attr.ib(default=None)
     email_subject = attr.ib(default=None)
     is_slideshow = attr.ib(default=False)
+    category = attr.ib(default=None)
 
     def saveable_output(self):
         out = attr.asdict(self)
@@ -164,6 +165,7 @@ class NotebookResultComplete(NotebookResultBase):
     scheduler_job_id = attr.ib(default=None)
     mailfrom = attr.ib(default=None)
     is_slideshow = attr.ib(default=False)
+    category = attr.ib(default=None)
 
     def html_resources(self):
         """We have to save the raw images using Mongo GridFS - figure out where they will go here"""
@@ -197,6 +199,7 @@ def saveable_output(self):
             "raw_html": "",  # backwards compatibility for versions<0.3.1
             "mailfrom": self.mailfrom,
             "is_slideshow": self.is_slideshow,
+            "category": self.category,
         }
 
     def __repr__(self):
@@ -205,7 +208,7 @@ def __repr__(self):
             "job_start_time={job_start_time}, job_finish_time={job_finish_time}, update_time={update_time}, "
             "report_title={report_title}, overrides={overrides}, mailto={mailto}, error_mailto={error_mailto}, "
             "mailfrom={mailfrom}, email_subject={email_subject}, generate_pdf_output={generate_pdf_output}, "
-            "hide_code={hide_code}, scheduler_job_id={scheduler_job_id}, is_slideshow={is_slideshow})".format(
+            "hide_code={hide_code}, scheduler_job_id={scheduler_job_id}, is_slideshow={is_slideshow}, category={category})".format(
                 job_id=self.job_id,
                 status=self.status,
                 report_name=self.report_name,
@@ -222,5 +225,6 @@ def __repr__(self):
                 hide_code=self.hide_code,
                 scheduler_job_id=self.scheduler_job_id,
                 is_slideshow=self.is_slideshow,
+                category=self.category,
             )
         )
diff --git a/notebooker/execute_notebook.py b/notebooker/execute_notebook.py
@@ -54,6 +54,7 @@ def _run_checks(
     scheduler_job_id: Optional[str] = None,
     mailfrom: Optional[str] = None,
     is_slideshow: bool = False,
+    category: Optional[str] = None,
 ) -> NotebookResultComplete:
     """
     This is the actual method which executes a notebook, whether running in the webapp or via the entrypoint.
@@ -152,6 +153,7 @@ def _run_checks(
         generate_pdf_output=generate_pdf_output,
         report_name=template_name,
         report_title=report_title,
+        category=category,
         overrides=overrides,
         scheduler_job_id=scheduler_job_id,
         mailfrom=mailfrom,
@@ -164,6 +166,7 @@ def _run_checks(
 def run_report(
     job_submit_time,
     report_name,
+    category,
     overrides,
     result_serializer,
     report_title="",
@@ -222,6 +225,7 @@ def run_report(
             scheduler_job_id=scheduler_job_id,
             mailfrom=mailfrom,
             is_slideshow=is_slideshow,
+            category=category,
         )
         logger.info("Successfully got result.")
         result_serializer.save_check_result(result)
@@ -234,6 +238,7 @@ def run_report(
             job_start_time=job_submit_time,
             report_name=report_name,
             report_title=report_title,
+            category=category,
             error_info=error_info,
             overrides=overrides,
             mailto=mailto,
@@ -257,6 +262,7 @@ def run_report(
             return run_report(
                 job_submit_time,
                 report_name,
+                category,
                 overrides,
                 result_serializer,
                 report_title=report_title,
@@ -351,6 +357,7 @@ def _get_overrides(overrides_as_json: AnyStr, iterate_override_values_of: Option
 def execute_notebook_entrypoint(
     config: BaseConfig,
     report_name: str,
+    category: str,
     overrides_as_json: str,
     iterate_override_values_of: Union[List[str], str],
     report_title: str,
@@ -377,6 +384,7 @@ def execute_notebook_entrypoint(
     start_time = datetime.datetime.now()
     logger.info("Running a report with these parameters:")
     logger.info("report_name = %s", report_name)
+    logger.info("category = %s", category)
     logger.info("overrides_as_json = %s", overrides_as_json)
     logger.info("iterate_override_values_of = %s", iterate_override_values_of)
     logger.info("report_title = %s", report_title)
@@ -407,6 +415,7 @@ def execute_notebook_entrypoint(
         result = run_report(
             start_time,
             report_name,
+            category,
             overrides,
             result_serializer,
             report_title=report_title,
@@ -495,6 +504,7 @@ def run_report_in_subprocess(
     email_subject=None,
     n_retries=3,
     is_slideshow=False,
+    category=None,
 ) -> str:
     """
     Execute the Notebooker report in a subprocess.
@@ -513,6 +523,7 @@ def run_report_in_subprocess(
     :param email_subject: `str` if passed, then this string will be used in the email subject
     :param n_retries: The number of retries to attempt.
     :param is_slideshow: Whether the notebook is a reveal.js slideshow or not.
+    :param category: Category of the notebook
     :return: The unique job_id.
     """
     if error_mailto is None:
@@ -535,6 +546,7 @@ def run_report_in_subprocess(
         is_slideshow=is_slideshow,
         email_subject=email_subject,
         mailfrom=mailfrom,
+        category=category,
     )
 
     command = (
@@ -578,6 +590,7 @@ def run_report_in_subprocess(
         + (["--is-slideshow"] if is_slideshow else [])
         + ([f"--scheduler-job-id={scheduler_job_id}"] if scheduler_job_id is not None else [])
         + ([f"--mailfrom={mailfrom}"] if mailfrom is not None else [])
+        + ([f"--category={category}"] if category is not None else [])
         + ([f"--email-subject={email_subject}"] if email_subject else [])
     )
     p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

diff --git a/notebooker/serialization/mongo.py b/notebooker/serialization/mongo.py
@@ -209,6 +209,7 @@ def save_check_stub(
         is_slideshow: bool = False,
         email_subject: Optional[str] = None,
         mailfrom: Optional[str] = None,
+        category: Optional[str] = None,
     ) -> None:
         """Call this when we are just starting a check. Saves a "pending" job into storage."""
         job_start_time = job_start_time or datetime.datetime.now()
@@ -228,6 +229,7 @@ def save_check_stub(
             scheduler_job_id=scheduler_job_id,
             is_slideshow=is_slideshow,
             mailfrom=mailfrom,
+            category=category,
         )
         self._save_to_db(pending_result)
 
@@ -325,6 +327,7 @@ def _convert_result(
                 scheduler_job_id=result.get("scheduler_job_id", None),
                 is_slideshow=result.get("is_slideshow", False),
                 email_subject=result.get("email_subject", None),
+                category=result.get("category", None),
             )
         elif cls == NotebookResultPending:
             return NotebookResultPending(
@@ -344,6 +347,7 @@ def _convert_result(
                 stdout=result.get("stdout", []),
                 scheduler_job_id=result.get("scheduler_job_id", None),
                 is_slideshow=result.get("is_slideshow", False),
+                category=result.get("category", None),
             )
 
         elif cls == NotebookResultError:
@@ -370,6 +374,7 @@ def _convert_result(
                 stdout=result.get("stdout", []),
                 scheduler_job_id=result.get("scheduler_job_id", False),
                 is_slideshow=result.get("is_slideshow", False),
+                category=result.get("category", None),
             )
         else:
             raise ValueError("Could not deserialise {} into result object.".format(result))
@@ -397,10 +402,17 @@ def _get_result_count(self, base_filter):
 
     def get_count_and_latest_time_per_report(self, subfolder: Optional[str]):
         base_filer = {} if not subfolder else {"report_name": {"$regex": subfolder + ".*"}}
+        return self.fetch_reports(base_filer)
+
+    def get_count_and_latest_time_per_report_per_category(self, category: Optional[str]):
+        base_filer = {} if not category else {"category": category}
+        return self.fetch_reports(base_filer)
+
+    def fetch_reports(self, base_filer: Dict[str, Any]):
         reports = list(
             self._get_raw_results(
                 base_filter=base_filer,
-                projection={"report_name": 1, "job_start_time": 1, "scheduler_job_id": 1, "_id": 0},
+                projection={"report_name": 1, "job_start_time": 1, "scheduler_job_id": 1, "category": 1, "_id": 0},
                 limit=0,
             )
         )
@@ -411,7 +423,12 @@ def get_count_and_latest_time_per_report(self, subfolder: Optional[str]):
         for report, all_runs in jobs_by_name.items():
             latest_start_time = max(r["job_start_time"] for r in all_runs)
             scheduled_runs = len([x for x in all_runs if x.get("scheduler_job_id")])
-            output[report] = {"count": len(all_runs), "latest_run": latest_start_time, "scheduler_runs": scheduled_runs}
+            output[report] = {
+                "count": len(all_runs),
+                "latest_run": latest_start_time,
+                "scheduler_runs": scheduled_runs,
+                "category": r.get("category"),
+            }
         return output
 
     def get_all_results(

diff --git a/notebooker/settings.py b/notebooker/settings.py
@@ -28,6 +28,9 @@ class BaseConfig:
     # A boolean flag to dictate whether we should execute the notebook at the origin or not.
     EXECUTE_AT_ORIGIN: bool = False
 
+    # A boolean flag to dictate whether we should discover and group notebook templates by their category tags.
+    CATEGORIZATION: bool = False
+
     # The serializer class we are using for storage, e.g. PyMongoResultSerializer
     SERIALIZER_CLS: DEFAULT_SERIALIZER = None
     # The dictionary of parameters which are used to initialize the serializer class above

diff --git a/notebooker/utils/results.py b/notebooker/utils/results.py
@@ -6,7 +6,7 @@
 
 import babel.dates
 import inflection
-from flask import url_for
+from flask import url_for, current_app
 
 from notebooker import constants
 from notebooker.exceptions import NotebookRunException
@@ -140,12 +140,20 @@ def get_all_available_results_json(
 
 
 def get_count_and_latest_time_per_report(serializer: MongoResultSerializer, subfolder: Optional[str] = None):
-    reports = serializer.get_count_and_latest_time_per_report(subfolder)
+    if subfolder and current_app.config["CATEGORIZATION"]:
+        category = subfolder.rstrip("/")
+        reports = serializer.get_count_and_latest_time_per_report_per_category(category)
+    else:
+        reports = serializer.get_count_and_latest_time_per_report(subfolder)
     output = {}
     for report_name, metadata in sorted(reports.items(), key=lambda x: x[1]["latest_run"], reverse=True):
-        metadata["report_name"] = report_name
+        title_name = report_name
+        if "PATH_TO_CATEGORY_DICT" in current_app.config and report_name in current_app.config["PATH_TO_CATEGORY_DICT"]:
+            title_name = current_app.config["PATH_TO_CATEGORY_DICT"][report_name] + "/" + report_name.split("/")[-1]
+        metadata["report_name"] = title_name
+        metadata["original_report"] = report_name
         metadata["time_diff"] = babel.dates.format_timedelta(datetime.datetime.now() - metadata["latest_run"])
-        output[inflection.titleize(report_name)] = metadata
+        output[inflection.titleize(title_name)] = metadata
     return output