Skip to content

Commit e1e9134

Browse files
committed
Merge branch 'master' into qa
2 parents ef1e398 + 0aea2f5 commit e1e9134

File tree

19 files changed

+349
-101
lines changed

19 files changed

+349
-101
lines changed

.github/actions/prepare-breadbox/action.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ runs:
1212
- name: "Install and configure Poetry"
1313
uses: snok/install-poetry@v1
1414
with:
15-
version: 2.2.1
15+
version: 1.8.2
1616
virtualenvs-create: true
1717
virtualenvs-in-project: true
1818

breadbox-client/bump_version_and_publish.py

Lines changed: 50 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python3
2+
# force build
23
"""
34
Breadbox Version Bumping and Publishing Script
45
=============================================
@@ -29,8 +30,8 @@
2930
import re
3031
import argparse
3132

32-
33-
VERSION_TAG_PATTERN="breadbox-(\\d+.\\d+.\\d+)"
33+
TAG_PREFIX = "breadbox-"
34+
VERSION_TAG_PATTERN=TAG_PREFIX + "(\\d+.\\d+.\\d+)"
3435
IGNORE_CONVENTIONAL_COMMIT_TYPES = ["build", "chore", "ci", "docs", "style", "refactor", "perf", "test"]
3536
PATCH_CONVENTIONAL_COMMIT_TYPES = ["fix", "revert"]
3637
MINOR_CONVENTIONAL_COMMIT_TYPES = ["feat"]
@@ -69,30 +70,24 @@ def main():
6970
print("Starting version bump process...")
7071
bump_rules = []
7172

72-
last_commit = None
73-
last_version = None
74-
print("Analyzing git history for version tags and conventional commits...")
75-
for commit_hash, version, bump_rule, commit_subject in get_sem_versions_and_bumps():
76-
if bump_rule is not None:
77-
bump_rules.append((commit_hash, commit_subject, bump_rule))
73+
last_version = get_last_sem_version()
74+
if last_version is None:
75+
raise AssertionError("No previous version tag found. Cannot proceed without a base version.")
7876

79-
if last_commit is None:
80-
last_commit = commit_hash
77+
# Create tag name from last version
78+
last_tag = f"{TAG_PREFIX}{'.'.join(map(str, last_version))}"
8179

82-
if version is not None:
83-
last_version = version
84-
print(f"Found version tag: {'.'.join(map(str, version))}")
85-
break
80+
print(f"Looking at git history starting at {last_tag} for conventional commits...")
8681

87-
if last_version is None:
88-
raise AssertionError("No previous version tag found. Cannot proceed without a base version.")
82+
for commit_hash, bump_rule, commit_subject in get_bumps(last_tag):
83+
bump_rules.append((commit_hash, commit_subject, bump_rule))
8984

9085
if len(bump_rules) == 0:
9186
print(
9287
f"No changes found which require updating version")
9388
return
9489

95-
print(f"Applying {len(bump_rules)} version bump rules, starting with {last_version} to generate version for {last_commit}...")
90+
print(f"Applying {len(bump_rules)} version bump rules, starting with {last_version}")
9691
bump_rules.reverse()
9792
for commit_hash, commit_subject, bump_rule in bump_rules:
9893
old_version = last_version
@@ -126,8 +121,9 @@ def update_version_in_files(version_str, dryrun=False):
126121
content = file.read()
127122

128123
# Update version using regex
129-
updated_content = re.sub(r'version\s*=\s*"[^"]+"', f'version = "{version_str}"', content)
130-
124+
updated_content = re.sub(r'^version\s*=\s*"[^"]+"$', f'version = "{version_str}"', content, flags=re.MULTILINE)
125+
assert updated_content != content, "Version should have changed, but result after substituting was the same"
126+
131127
# Write updated content back
132128
with open(filename, 'w') as file:
133129
file.write(updated_content)
@@ -165,14 +161,14 @@ def update_version_in_files(version_str, dryrun=False):
165161
print(" DRY RUN: Would commit version changes")
166162

167163
def tag_repo(version_str):
168-
tag_name = f"breadbox-{version_str}"
164+
tag_name = f"{TAG_PREFIX}{version_str}"
169165
print(f" Creating git tag: {tag_name}...")
170166
try:
171167
# Create an annotated tag
172168
subprocess.run(["git", "tag", "-a", tag_name, "-m", f"Release {version_str}"], check=True)
173169
# Push the tag to remote
174170
print(" Pushing tag to remote...")
175-
subprocess.run(["git", "push", "origin", tag_name], check=True)
171+
subprocess.run(["git", "push", "origin", tag_name, "master"], check=True)
176172
except Exception as e:
177173
print(f"Error tagging repository: {str(e)}")
178174
raise
@@ -186,7 +182,8 @@ def publish():
186182

187183
# Build and publish the package
188184
print(" Building and publishing package...")
189-
subprocess.run(["poetry", "publish", "--build", "--repository", "public-python"], check=True)
185+
subprocess.run(["poetry", "build"], check=True)
186+
subprocess.run(["poetry", "publish", "--repository", "public-python"], check=True)
190187
print(" Package published successfully!")
191188
except Exception as e:
192189
print(f"Error publishing package: {str(e)}")
@@ -204,36 +201,50 @@ def rule_from_conventional_commit_type(commit_type, is_breaking):
204201
else:
205202
return None
206203

207-
def get_sem_versions_and_bumps():
208-
print(" Retrieving git commit history...")
204+
def get_last_sem_version():
209205
try:
210-
output = subprocess.check_output(
211-
["git", "log", "--pretty=format:%H%x09%s%x09%D"],
206+
# Get all tags
207+
tags_output = subprocess.check_output(
208+
["git", "tag"],
212209
text=True
213-
)
210+
).strip().split('\n')
211+
212+
# Get highest version from tags
213+
highest_version = to_sem_version(tags_output)
214214
except Exception as e:
215215
print(f"Error retrieving git history: {str(e)}")
216216
raise
217+
return highest_version
217218

218-
for line in output.splitlines():
219-
commit_hash, subject, refs = line.split("\t", 2)
220-
tags = [r.strip() for r in refs.split(",") if r.strip().startswith("tag: ")]
221-
tags = [t.replace("tag: ", "") for t in tags]
222-
223-
version = to_sem_version(tags)
219+
def get_bumps(last_tag):
220+
print(" Retrieving git commit history...")
221+
222+
223+
# Get commits from HEAD to the last version tag
224+
commit_output = subprocess.check_output(
225+
["git", "log", f"{last_tag}..HEAD", "--pretty=format:%H%x09%s"],
226+
text=True
227+
)
224228

225-
bump_rule = rule_from_conventional_commit(subject)
226-
if bump_rule is not None or version is not None:
227-
# print(f" Found conventional commit at {commit_hash[:8]}: {subject}")
228-
yield commit_hash, version, bump_rule, subject
229+
# Then yield all conventional commits
230+
for line in commit_output.splitlines():
231+
parts = line.split("\t", 1)
232+
if len(parts) == 2:
233+
commit_hash, subject = parts
234+
bump_rule = rule_from_conventional_commit(subject)
235+
if bump_rule is not None:
236+
yield commit_hash, bump_rule, subject
229237

230238
def to_sem_version(tags):
231239
"""
232-
given a list of tags, extract the semantic version number using VERSION_TAG_PATTERN. If there are multiple, returns the max.
233-
If there are no tags or none match, returns None.
240+
Given a list of tags, extract the semantic version numbers using VERSION_TAG_PATTERN.
241+
Returns the highest version found, or None if no valid version tags exist.
234242
"""
235243
versions = []
236244
for tag in tags:
245+
if not tag: # Skip empty tags
246+
continue
247+
237248
match = re.match(VERSION_TAG_PATTERN, tag)
238249
if match:
239250
# Extract version number from the tag using regex group

breadbox-client/pyproject.toml

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,24 @@
11
[tool.poetry]
22
name = "breadbox-client"
3-
version = "4.3.3"
3+
version = "4.3.19"
44
description = "A client library for accessing Breadbox"
55

66
authors = []
77

88
readme = "README.md"
99
packages = [
10-
{include = "breadbox_client"},
1110
{include = "breadbox_facade"},
1211
]
13-
include = ["CHANGELOG.md", "breadbox_client/py.typed", "breadbox_client/**/*"]
12+
13+
include = [
14+
{ path = "CHANGELOG.md", format = ["sdist", "wheel"] },
15+
# because the breadbox_client directory is generated, it's listed in .gitignore
16+
# however, this causes poetry to also ignore these files. There doesn't seem to
17+
# be a way to override that behavior so explicitly tell it to add all the files
18+
# to the wheel and sdist files
19+
{ path = "breadbox_client/py.typed", format = ["sdist", "wheel"] },
20+
{ path = "breadbox_client/**/*.py", format = ["sdist", "wheel"] }
21+
]
1422

1523
[tool.poetry.dependencies]
1624
python = "^3.9"

breadbox/breadbox/service/search.py

Lines changed: 34 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from typing import Dict, List
44

55
import pandas as pd
6+
from sqlalchemy import insert
7+
68
from ..crud.metadata import cast_tabular_cell_value_type
79
from ..crud import dimension_types as types_crud
810

@@ -33,11 +35,12 @@ class PropertyValuePair:
3335
class MetadataCacheEntry:
3436
properties_to_index_df: pd.DataFrame
3537
columns_metadata: Dict[str, ColumnMetadata]
36-
label_by_given_id: Dict[str, str]
37-
rows_by_index: Dict[str, Dict[str, str]]
38+
39+
def get_label_for_given_id(self, given_id):
40+
return self.properties_to_index_df.loc[given_id, "label"]
3841

3942
def get_properties_dict(self, given_id: str):
40-
return self.rows_by_index.get(given_id)
43+
return self.properties_to_index_df.loc[given_id].to_dict()
4144

4245

4346
class MetadataCache:
@@ -74,19 +77,13 @@ def get(self, dimension_type_name: str):
7477

7578
columns_metadata = dict(dimension_type.dataset.columns_metadata)
7679

77-
label_by_given_id = get_dimension_type_metadata_col(
78-
self.db, dimension_type_name=dimension_type.name, col_name="label"
79-
)
80-
8180
rows_by_index = {}
8281
for record in properties_to_index_df.to_records():
8382
rows_by_index[record.index] = record
8483

8584
entry = MetadataCacheEntry(
8685
properties_to_index_df=properties_to_index_df,
8786
columns_metadata=columns_metadata,
88-
label_by_given_id=label_by_given_id,
89-
rows_by_index=rows_by_index,
9087
)
9188
self.cache[dimension_type_name] = entry
9289

@@ -140,36 +137,46 @@ def refresh_search_index_for_dimension_type(
140137
_delete_search_index_records(db, dimension_type)
141138
log.info("_delete_search_index_records complete")
142139

143-
dimension_search_index_rows = []
144-
145140
cache_entry = metadata_cache.get(dimension_type.name)
146141

147-
for given_id in cache_entry.properties_to_index_df.index:
148-
for record in get_property_value_pairs_for_given_id(
149-
db=db,
150-
dimension_type_name=dimension_type.name,
151-
given_id=given_id,
152-
metadata_cache=metadata_cache,
153-
):
154-
# if given_id in cache_entry.dimension_id_by_given_id:
155-
dimension_search_index_rows.append(
156-
DimensionSearchIndex(
157-
# dimension_id=cache_entry.dimension_id_by_given_id[given_id],
142+
def row_generator():
143+
for given_id in cache_entry.properties_to_index_df.index:
144+
for record in get_property_value_pairs_for_given_id(
145+
db=db,
146+
dimension_type_name=dimension_type.name,
147+
given_id=given_id,
148+
metadata_cache=metadata_cache,
149+
):
150+
# if given_id in cache_entry.dimension_id_by_given_id:
151+
yield dict(
158152
property=record.property,
159153
value=record.value,
160154
group_id=dimension_type.dataset.group_id,
161155
dimension_type_name=dimension_type.name,
162156
dimension_given_id=given_id,
163-
label=cache_entry.label_by_given_id[given_id],
157+
label=cache_entry.get_label_for_given_id(given_id),
164158
)
165-
)
159+
160+
dimension_search_index_row_count = 0
161+
for batch in _make_batches(row_generator(), batch_size=1000):
162+
db.execute(insert(DimensionSearchIndex), batch)
163+
dimension_search_index_row_count += len(batch)
164+
f"Wrote batch of {len(batch)} search index records for {dimension_type_name}"
166165

167166
log.info(
168-
f"refresh_search_index_for_dimension_type generated {len(dimension_search_index_rows)} search index records for {len(cache_entry.properties_to_index_df.index)} rows in {dimension_type_name}. Writing..."
167+
f"Finished writing all {(dimension_search_index_row_count)} search index records for {len(cache_entry.properties_to_index_df.index)} rows in {dimension_type_name}"
169168
)
170169

171-
db.bulk_save_objects(dimension_search_index_rows)
172-
log.info("refresh_search_index_for_dimension_type complete")
170+
171+
def _make_batches(iterable, batch_size):
172+
batch = []
173+
for item in iterable:
174+
batch.append(item)
175+
if len(batch) >= batch_size:
176+
yield batch
177+
batch = []
178+
if len(batch) > 0:
179+
yield batch
173180

174181

175182
def _get_datatypes_referencing(db, dimension_type_name):

breadbox/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "breadbox"
3-
version = "4.3.3"
3+
version = "4.3.19"
44
description = "A persistent service that stores and fetches datasets"
55
authors = ["Jessica Cheng <[email protected]>"]
66
packages = [{include = "breadbox"}]

0 commit comments

Comments
 (0)