Skip to content

Commit e1e9134

Browse files
committed
Merge branch 'master' into qa
2 parents ef1e398 + 0aea2f5 commit e1e9134

File tree

19 files changed

+349
-101
lines changed

19 files changed

+349
-101
lines changed

.github/actions/prepare-breadbox/action.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ runs:
1212
- name: "Install and configure Poetry"
1313
uses: snok/install-poetry@v1
1414
with:
15-
version: 2.2.1
15+
version: 1.8.2
1616
virtualenvs-create: true
1717
virtualenvs-in-project: true
1818

breadbox-client/bump_version_and_publish.py

Lines changed: 50 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python3
2+
# force build
23
"""
34
Breadbox Version Bumping and Publishing Script
45
=============================================
@@ -29,8 +30,8 @@
2930
import re
3031
import argparse
3132

32-
33-
VERSION_TAG_PATTERN="breadbox-(\\d+.\\d+.\\d+)"
33+
TAG_PREFIX = "breadbox-"
34+
VERSION_TAG_PATTERN=TAG_PREFIX + "(\\d+.\\d+.\\d+)"
3435
IGNORE_CONVENTIONAL_COMMIT_TYPES = ["build", "chore", "ci", "docs", "style", "refactor", "perf", "test"]
3536
PATCH_CONVENTIONAL_COMMIT_TYPES = ["fix", "revert"]
3637
MINOR_CONVENTIONAL_COMMIT_TYPES = ["feat"]
@@ -69,30 +70,24 @@ def main():
6970
print("Starting version bump process...")
7071
bump_rules = []
7172

72-
last_commit = None
73-
last_version = None
74-
print("Analyzing git history for version tags and conventional commits...")
75-
for commit_hash, version, bump_rule, commit_subject in get_sem_versions_and_bumps():
76-
if bump_rule is not None:
77-
bump_rules.append((commit_hash, commit_subject, bump_rule))
73+
last_version = get_last_sem_version()
74+
if last_version is None:
75+
raise AssertionError("No previous version tag found. Cannot proceed without a base version.")
7876

79-
if last_commit is None:
80-
last_commit = commit_hash
77+
# Create tag name from last version
78+
last_tag = f"{TAG_PREFIX}{'.'.join(map(str, last_version))}"
8179

82-
if version is not None:
83-
last_version = version
84-
print(f"Found version tag: {'.'.join(map(str, version))}")
85-
break
80+
print(f"Looking at git history starting at {last_tag} for conventional commits...")
8681

87-
if last_version is None:
88-
raise AssertionError("No previous version tag found. Cannot proceed without a base version.")
82+
for commit_hash, bump_rule, commit_subject in get_bumps(last_tag):
83+
bump_rules.append((commit_hash, commit_subject, bump_rule))
8984

9085
if len(bump_rules) == 0:
9186
print(
9287
f"No changes found which require updating version")
9388
return
9489

95-
print(f"Applying {len(bump_rules)} version bump rules, starting with {last_version} to generate version for {last_commit}...")
90+
print(f"Applying {len(bump_rules)} version bump rules, starting with {last_version}")
9691
bump_rules.reverse()
9792
for commit_hash, commit_subject, bump_rule in bump_rules:
9893
old_version = last_version
@@ -126,8 +121,9 @@ def update_version_in_files(version_str, dryrun=False):
126121
content = file.read()
127122

128123
# Update version using regex
129-
updated_content = re.sub(r'version\s*=\s*"[^"]+"', f'version = "{version_str}"', content)
130-
124+
updated_content = re.sub(r'^version\s*=\s*"[^"]+"$', f'version = "{version_str}"', content, flags=re.MULTILINE)
125+
assert updated_content != content, "Version should have changed, but result after substituting was the same"
126+
131127
# Write updated content back
132128
with open(filename, 'w') as file:
133129
file.write(updated_content)
@@ -165,14 +161,14 @@ def update_version_in_files(version_str, dryrun=False):
165161
print(" DRY RUN: Would commit version changes")
166162

167163
def tag_repo(version_str):
168-
tag_name = f"breadbox-{version_str}"
164+
tag_name = f"{TAG_PREFIX}{version_str}"
169165
print(f" Creating git tag: {tag_name}...")
170166
try:
171167
# Create an annotated tag
172168
subprocess.run(["git", "tag", "-a", tag_name, "-m", f"Release {version_str}"], check=True)
173169
# Push the tag to remote
174170
print(" Pushing tag to remote...")
175-
subprocess.run(["git", "push", "origin", tag_name], check=True)
171+
subprocess.run(["git", "push", "origin", tag_name, "master"], check=True)
176172
except Exception as e:
177173
print(f"Error tagging repository: {str(e)}")
178174
raise
@@ -186,7 +182,8 @@ def publish():
186182

187183
# Build and publish the package
188184
print(" Building and publishing package...")
189-
subprocess.run(["poetry", "publish", "--build", "--repository", "public-python"], check=True)
185+
subprocess.run(["poetry", "build"], check=True)
186+
subprocess.run(["poetry", "publish", "--repository", "public-python"], check=True)
190187
print(" Package published successfully!")
191188
except Exception as e:
192189
print(f"Error publishing package: {str(e)}")
@@ -204,36 +201,50 @@ def rule_from_conventional_commit_type(commit_type, is_breaking):
204201
else:
205202
return None
206203

207-
def get_sem_versions_and_bumps():
208-
print(" Retrieving git commit history...")
204+
def get_last_sem_version():
209205
try:
210-
output = subprocess.check_output(
211-
["git", "log", "--pretty=format:%H%x09%s%x09%D"],
206+
# Get all tags
207+
tags_output = subprocess.check_output(
208+
["git", "tag"],
212209
text=True
213-
)
210+
).strip().split('\n')
211+
212+
# Get highest version from tags
213+
highest_version = to_sem_version(tags_output)
214214
except Exception as e:
215215
print(f"Error retrieving git history: {str(e)}")
216216
raise
217+
return highest_version
217218

218-
for line in output.splitlines():
219-
commit_hash, subject, refs = line.split("\t", 2)
220-
tags = [r.strip() for r in refs.split(",") if r.strip().startswith("tag: ")]
221-
tags = [t.replace("tag: ", "") for t in tags]
222-
223-
version = to_sem_version(tags)
219+
def get_bumps(last_tag):
220+
print(" Retrieving git commit history...")
221+
222+
223+
# Get commits from HEAD to the last version tag
224+
commit_output = subprocess.check_output(
225+
["git", "log", f"{last_tag}..HEAD", "--pretty=format:%H%x09%s"],
226+
text=True
227+
)
224228

225-
bump_rule = rule_from_conventional_commit(subject)
226-
if bump_rule is not None or version is not None:
227-
# print(f" Found conventional commit at {commit_hash[:8]}: {subject}")
228-
yield commit_hash, version, bump_rule, subject
229+
# Then yield all conventional commits
230+
for line in commit_output.splitlines():
231+
parts = line.split("\t", 1)
232+
if len(parts) == 2:
233+
commit_hash, subject = parts
234+
bump_rule = rule_from_conventional_commit(subject)
235+
if bump_rule is not None:
236+
yield commit_hash, bump_rule, subject
229237

230238
def to_sem_version(tags):
231239
"""
232-
given a list of tags, extract the semantic version number using VERSION_TAG_PATTERN. If there are multiple, returns the max.
233-
If there are no tags or none match, returns None.
240+
Given a list of tags, extract the semantic version numbers using VERSION_TAG_PATTERN.
241+
Returns the highest version found, or None if no valid version tags exist.
234242
"""
235243
versions = []
236244
for tag in tags:
245+
if not tag: # Skip empty tags
246+
continue
247+
237248
match = re.match(VERSION_TAG_PATTERN, tag)
238249
if match:
239250
# Extract version number from the tag using regex group

breadbox-client/pyproject.toml

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,24 @@
11
[tool.poetry]
22
name = "breadbox-client"
3-
version = "4.3.3"
3+
version = "4.3.19"
44
description = "A client library for accessing Breadbox"
55

66
authors = []
77

88
readme = "README.md"
99
packages = [
10-
{include = "breadbox_client"},
1110
{include = "breadbox_facade"},
1211
]
13-
include = ["CHANGELOG.md", "breadbox_client/py.typed", "breadbox_client/**/*"]
12+
13+
include = [
14+
{ path = "CHANGELOG.md", format = ["sdist", "wheel"] },
15+
# because the breadbox_client directory is generated, it's listed in .gitignore
16+
# however, this causes poetry to also ignore these files. There doesn't seem to
17+
# be a way to override that behavior so explicitly tell it to add all the files
18+
# to the wheel and sdist files
19+
{ path = "breadbox_client/py.typed", format = ["sdist", "wheel"] },
20+
{ path = "breadbox_client/**/*.py", format = ["sdist", "wheel"] }
21+
]
1422

1523
[tool.poetry.dependencies]
1624
python = "^3.9"

breadbox/breadbox/service/search.py

Lines changed: 34 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from typing import Dict, List
44

55
import pandas as pd
6+
from sqlalchemy import insert
7+
68
from ..crud.metadata import cast_tabular_cell_value_type
79
from ..crud import dimension_types as types_crud
810

@@ -33,11 +35,12 @@ class PropertyValuePair:
3335
class MetadataCacheEntry:
3436
properties_to_index_df: pd.DataFrame
3537
columns_metadata: Dict[str, ColumnMetadata]
36-
label_by_given_id: Dict[str, str]
37-
rows_by_index: Dict[str, Dict[str, str]]
38+
39+
def get_label_for_given_id(self, given_id):
40+
return self.properties_to_index_df.loc[given_id, "label"]
3841

3942
def get_properties_dict(self, given_id: str):
40-
return self.rows_by_index.get(given_id)
43+
return self.properties_to_index_df.loc[given_id].to_dict()
4144

4245

4346
class MetadataCache:
@@ -74,19 +77,13 @@ def get(self, dimension_type_name: str):
7477

7578
columns_metadata = dict(dimension_type.dataset.columns_metadata)
7679

77-
label_by_given_id = get_dimension_type_metadata_col(
78-
self.db, dimension_type_name=dimension_type.name, col_name="label"
79-
)
80-
8180
rows_by_index = {}
8281
for record in properties_to_index_df.to_records():
8382
rows_by_index[record.index] = record
8483

8584
entry = MetadataCacheEntry(
8685
properties_to_index_df=properties_to_index_df,
8786
columns_metadata=columns_metadata,
88-
label_by_given_id=label_by_given_id,
89-
rows_by_index=rows_by_index,
9087
)
9188
self.cache[dimension_type_name] = entry
9289

@@ -140,36 +137,46 @@ def refresh_search_index_for_dimension_type(
140137
_delete_search_index_records(db, dimension_type)
141138
log.info("_delete_search_index_records complete")
142139

143-
dimension_search_index_rows = []
144-
145140
cache_entry = metadata_cache.get(dimension_type.name)
146141

147-
for given_id in cache_entry.properties_to_index_df.index:
148-
for record in get_property_value_pairs_for_given_id(
149-
db=db,
150-
dimension_type_name=dimension_type.name,
151-
given_id=given_id,
152-
metadata_cache=metadata_cache,
153-
):
154-
# if given_id in cache_entry.dimension_id_by_given_id:
155-
dimension_search_index_rows.append(
156-
DimensionSearchIndex(
157-
# dimension_id=cache_entry.dimension_id_by_given_id[given_id],
142+
def row_generator():
143+
for given_id in cache_entry.properties_to_index_df.index:
144+
for record in get_property_value_pairs_for_given_id(
145+
db=db,
146+
dimension_type_name=dimension_type.name,
147+
given_id=given_id,
148+
metadata_cache=metadata_cache,
149+
):
150+
# if given_id in cache_entry.dimension_id_by_given_id:
151+
yield dict(
158152
property=record.property,
159153
value=record.value,
160154
group_id=dimension_type.dataset.group_id,
161155
dimension_type_name=dimension_type.name,
162156
dimension_given_id=given_id,
163-
label=cache_entry.label_by_given_id[given_id],
157+
label=cache_entry.get_label_for_given_id(given_id),
164158
)
165-
)
159+
160+
dimension_search_index_row_count = 0
161+
for batch in _make_batches(row_generator(), batch_size=1000):
162+
db.execute(insert(DimensionSearchIndex), batch)
163+
dimension_search_index_row_count += len(batch)
164+
f"Wrote batch of {len(batch)} search index records for {dimension_type_name}"
166165

167166
log.info(
168-
f"refresh_search_index_for_dimension_type generated {len(dimension_search_index_rows)} search index records for {len(cache_entry.properties_to_index_df.index)} rows in {dimension_type_name}. Writing..."
167+
f"Finished writing all {(dimension_search_index_row_count)} search index records for {len(cache_entry.properties_to_index_df.index)} rows in {dimension_type_name}"
169168
)
170169

171-
db.bulk_save_objects(dimension_search_index_rows)
172-
log.info("refresh_search_index_for_dimension_type complete")
170+
171+
def _make_batches(iterable, batch_size):
172+
batch = []
173+
for item in iterable:
174+
batch.append(item)
175+
if len(batch) >= batch_size:
176+
yield batch
177+
batch = []
178+
if len(batch) > 0:
179+
yield batch
173180

174181

175182
def _get_datatypes_referencing(db, dimension_type_name):

breadbox/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "breadbox"
3-
version = "4.3.3"
3+
version = "4.3.19"
44
description = "A persistent service that stores and fetches datasets"
55
authors = ["Jessica Cheng <[email protected]>"]
66
packages = [{include = "breadbox"}]

0 commit comments

Comments
 (0)