Skip to content

Commit 70231e9

Browse files
committed
Work around lack of iterator support in django-cachalot
See also noripyt/django-cachalot#256
1 parent 6b44767 commit 70231e9

File tree

10 files changed

+199
-171
lines changed

10 files changed

+199
-171
lines changed

isic/core/services/__init__.py

+97-86
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import operator
44
from typing import Any
55

6+
from cachalot.api import cachalot_disabled
67
from django.db.models import Func
78
from django.db.models.aggregates import Count
89
from django.db.models.query import QuerySet
@@ -73,69 +74,73 @@ def staff_image_metadata_csv(
7374
+ sorted([f"unstructured.{key}" for key in used_unstructured_metadata_keys])
7475
)
7576

76-
# Note this uses .values because populating django ORM objects is very slow, and doing this on
77-
# large querysets can add ~5s per 100k images to the request time.
78-
for image in (
79-
qs.order_by("isic_id")
80-
.values(
81-
"accession__original_blob_name",
82-
"isic_id",
83-
"accession__cohort_id",
84-
"accession__cohort__name",
85-
"accession__cohort__attribution",
86-
"accession__copyright_license",
87-
"public",
88-
*[f"accession__{key}" for key in used_metadata_keys],
89-
*[f"accession__{field.csv_field_name}" for field in Accession.remapped_internal_fields],
90-
*[f"accession__{field.input_field_name}" for field in Accession.computed_fields],
91-
*[
92-
f"accession__{field.relation_name}__{field.internal_id_name}"
93-
for field in Accession.remapped_internal_fields
94-
],
95-
"accession__unstructured_metadata__value",
96-
)
97-
.iterator()
98-
):
99-
value = {
100-
"original_filename": image["accession__original_blob_name"],
101-
"isic_id": image["isic_id"],
102-
"cohort_id": image["accession__cohort_id"],
103-
"cohort": image["accession__cohort__name"],
104-
"attribution": image["accession__cohort__attribution"],
105-
"copyright_license": image["accession__copyright_license"],
106-
"public": image["public"],
107-
**{
108-
k.replace("accession__", ""): v
109-
for k, v in image.items()
110-
if k.replace("accession__", "") in Accession.metadata_keys()
111-
},
112-
**{
113-
field.internal_id_name: image[
77+
with cachalot_disabled():
78+
# Note this uses .values because populating django ORM objects is very slow, and doing this
79+
# on large querysets can add ~5s per 100k images to the request time.
80+
for image in (
81+
qs.order_by("isic_id")
82+
.values(
83+
"accession__original_blob_name",
84+
"isic_id",
85+
"accession__cohort_id",
86+
"accession__cohort__name",
87+
"accession__cohort__attribution",
88+
"accession__copyright_license",
89+
"public",
90+
*[f"accession__{key}" for key in used_metadata_keys],
91+
*[
92+
f"accession__{field.csv_field_name}"
93+
for field in Accession.remapped_internal_fields
94+
],
95+
*[f"accession__{field.input_field_name}" for field in Accession.computed_fields],
96+
*[
11497
f"accession__{field.relation_name}__{field.internal_id_name}"
115-
]
116-
for field in Accession.remapped_internal_fields
117-
},
118-
**{
119-
field.csv_field_name: image[f"accession__{field.csv_field_name}"]
120-
for field in Accession.remapped_internal_fields
121-
},
122-
**{
123-
f"unstructured.{k}": v
124-
for k, v in image["accession__unstructured_metadata__value"].items()
125-
},
126-
}
127-
128-
for field in Accession.computed_fields:
129-
computed_output_fields = field.transformer(
130-
image[f"accession__{field.input_field_name}"]
131-
if image.get(f"accession__{field.input_field_name}")
132-
else None
98+
for field in Accession.remapped_internal_fields
99+
],
100+
"accession__unstructured_metadata__value",
133101
)
134-
135-
if computed_output_fields:
136-
value.update(computed_output_fields)
137-
138-
yield value
102+
.iterator()
103+
):
104+
value = {
105+
"original_filename": image["accession__original_blob_name"],
106+
"isic_id": image["isic_id"],
107+
"cohort_id": image["accession__cohort_id"],
108+
"cohort": image["accession__cohort__name"],
109+
"attribution": image["accession__cohort__attribution"],
110+
"copyright_license": image["accession__copyright_license"],
111+
"public": image["public"],
112+
**{
113+
k.replace("accession__", ""): v
114+
for k, v in image.items()
115+
if k.replace("accession__", "") in Accession.metadata_keys()
116+
},
117+
**{
118+
field.internal_id_name: image[
119+
f"accession__{field.relation_name}__{field.internal_id_name}"
120+
]
121+
for field in Accession.remapped_internal_fields
122+
},
123+
**{
124+
field.csv_field_name: image[f"accession__{field.csv_field_name}"]
125+
for field in Accession.remapped_internal_fields
126+
},
127+
**{
128+
f"unstructured.{k}": v
129+
for k, v in image["accession__unstructured_metadata__value"].items()
130+
},
131+
}
132+
133+
for field in Accession.computed_fields:
134+
computed_output_fields = field.transformer(
135+
image[f"accession__{field.input_field_name}"]
136+
if image.get(f"accession__{field.input_field_name}")
137+
else None
138+
)
139+
140+
if computed_output_fields:
141+
value.update(computed_output_fields)
142+
143+
yield value
139144

140145

141146
def image_metadata_csv(
@@ -167,28 +172,34 @@ def image_metadata_csv(
167172
fieldnames = headers + sorted(used_metadata_keys)
168173
yield fieldnames
169174

170-
# Note this uses .values because populating django ORM objects is very slow, and doing this on
171-
# large querysets can add ~5s per 100k images to the request time.
172-
for image in (
173-
qs.order_by("isic_id")
174-
.values(
175-
"isic_id",
176-
"accession__cohort__attribution",
177-
"accession__copyright_license",
178-
*[f"accession__{key}" for key in Accession.metadata_keys()],
179-
*[f"accession__{field.csv_field_name}" for field in Accession.remapped_internal_fields],
180-
)
181-
.iterator()
182-
):
183-
image = {k.replace("accession__", ""): v for k, v in image.items()} # noqa: PLW2901
184-
185-
image["attribution"] = image.pop("cohort__attribution")
186-
187-
for computed_field in Accession.computed_fields:
188-
if image[computed_field.input_field_name]:
189-
computed_fields = computed_field.transformer(image[computed_field.input_field_name])
190-
if computed_fields:
191-
image.update(computed_fields)
192-
del image[computed_field.input_field_name]
193-
194-
yield {k: v for k, v in image.items() if k in fieldnames}
175+
with cachalot_disabled():
176+
# Note this uses .values because populating django ORM objects is very slow, and doing this
177+
# on large querysets can add ~5s per 100k images to the request time.
178+
for image in (
179+
qs.order_by("isic_id")
180+
.values(
181+
"isic_id",
182+
"accession__cohort__attribution",
183+
"accession__copyright_license",
184+
*[f"accession__{key}" for key in Accession.metadata_keys()],
185+
*[
186+
f"accession__{field.csv_field_name}"
187+
for field in Accession.remapped_internal_fields
188+
],
189+
)
190+
.iterator()
191+
):
192+
image = {k.replace("accession__", ""): v for k, v in image.items()} # noqa: PLW2901
193+
194+
image["attribution"] = image.pop("cohort__attribution")
195+
196+
for computed_field in Accession.computed_fields:
197+
if image[computed_field.input_field_name]:
198+
computed_fields = computed_field.transformer(
199+
image[computed_field.input_field_name]
200+
)
201+
if computed_fields:
202+
image.update(computed_fields)
203+
del image[computed_field.input_field_name]
204+
205+
yield {k: v for k, v in image.items() if k in fieldnames}

isic/core/services/collection/image.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import itertools
22

3+
from cachalot.api import cachalot_disabled
34
from django.contrib.auth.models import User
45
from django.core.exceptions import ValidationError
56
from django.db import transaction
@@ -31,7 +32,7 @@ def collection_add_images(
3132
if collection.public and qs.private().exists():
3233
raise ValidationError("Can't add private images to a public collection.")
3334

34-
with transaction.atomic():
35+
with transaction.atomic(), cachalot_disabled():
3536
CollectionImageM2M = Collection.images.through # noqa: N806
3637
for image_batch in itertools.batched(qs.iterator(), 5_000):
3738
# ignore_conflicts is necessary to make this method idempotent (consistent with

isic/core/services/image/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import itertools
22

3+
from cachalot.api import cachalot_disabled
34
from django.contrib.auth.models import User
45
from django.db import transaction
56
from django.db.models import QuerySet
@@ -30,7 +31,7 @@ def image_share(
3031
if image:
3132
qs = Image.objects.filter(pk=image.pk)
3233

33-
with transaction.atomic():
34+
with transaction.atomic(), cachalot_disabled():
3435
ImageShareM2M = Image.shares.through # noqa: N806
3536
for image_batch in itertools.batched(qs.iterator(), 5_000):
3637
# ignore_conflicts is necessary to make this method idempotent. ignore_conflicts only

isic/core/tasks.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from typing import cast
66
import uuid
77

8+
from cachalot.api import cachalot_disabled
89
from celery import shared_task
910
from django.conf import settings
1011
from django.contrib.auth.models import User
@@ -59,7 +60,8 @@ def share_collection_with_users_task(collection_pk: int, grantor_pk: int, user_p
5960
retry_kwargs={"max_retries": 15},
6061
)
6162
def sync_elasticsearch_index_task():
62-
bulk_add_to_search_index(Image.objects.with_elasticsearch_properties().iterator())
63+
with cachalot_disabled():
64+
bulk_add_to_search_index(Image.objects.with_elasticsearch_properties().iterator())
6365

6466

6567
@shared_task(soft_time_limit=1800, time_limit=1810)

isic/ingest/admin.py

+13-11
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from datetime import UTC, datetime
33
import logging
44

5+
from cachalot.api import cachalot_disabled
56
from django.contrib import admin
67
from django.contrib.humanize.templatetags.humanize import intcomma
78
from django.db import models
@@ -136,17 +137,18 @@ def export_file_mapping(self, request, queryset):
136137

137138
writer.writeheader()
138139
for cohort in queryset.select_related("contributor"):
139-
for accession in cohort.accessions.values(
140-
"original_blob_name",
141-
"image__isic_id",
142-
).iterator():
143-
d = {
144-
"contributor": cohort.contributor.institution_name,
145-
"cohort": cohort.name,
146-
"filename": accession["original_blob_name"],
147-
"isic_id": accession["image__isic_id"],
148-
}
149-
writer.writerow(d)
140+
with cachalot_disabled():
141+
for accession in cohort.accessions.values(
142+
"original_blob_name",
143+
"image__isic_id",
144+
).iterator():
145+
d = {
146+
"contributor": cohort.contributor.institution_name,
147+
"cohort": cohort.name,
148+
"filename": accession["original_blob_name"],
149+
"isic_id": accession["image__isic_id"],
150+
}
151+
writer.writerow(d)
150152
return response
151153

152154

isic/ingest/services/cohort/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import logging
22

3+
from cachalot.api import cachalot_disabled
34
from django.contrib.auth.models import User
45
from django.core.exceptions import ValidationError
56
from django.db import transaction
@@ -56,7 +57,7 @@ def cohort_publish(
5657
)
5758

5859
# this creates a transaction
59-
with lock_table_for_writes(IsicId):
60+
with lock_table_for_writes(IsicId), cachalot_disabled():
6061
for accession in cohort.accessions.publishable().iterator():
6162
image = image_create(creator=publisher, accession=accession, public=public)
6263
collection_add_images(collection=cohort.collection, image=image, ignore_lock=True)

isic/ingest/tasks.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import itertools
33
import time
44

5+
from cachalot.api import cachalot_disabled
56
from celery import shared_task
67
from celery.exceptions import SoftTimeLimitExceeded
78
from celery.utils.log import get_task_logger
@@ -63,12 +64,13 @@ def extract_zip_task(zip_pk: int):
6364
# rmq can only handle ~500msg/s - throttle significantly in places
6465
# where we could be putting many messages onto the queue at once.
6566
def generate_blobs():
66-
for accession_id in throttled_iterator(
67-
zip_upload.accessions.values_list("id", flat=True).iterator()
68-
):
69-
# avoid .delay since we want to avoid putting tens of thousands of elements
70-
# into the transaction.on_commit list.
71-
accession_generate_blob_task.apply_async(args=[accession_id])
67+
with cachalot_disabled():
68+
for accession_id in throttled_iterator(
69+
zip_upload.accessions.values_list("id", flat=True).iterator()
70+
):
71+
# avoid .delay since we want to avoid putting tens of thousands of elements
72+
# into the transaction.on_commit list.
73+
accession_generate_blob_task.apply_async(args=[accession_id])
7274

7375
transaction.on_commit(generate_blobs)
7476

isic/ingest/utils/metadata.py

+10-7
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import itertools
55
from typing import Any
66

7+
from cachalot.api import cachalot_disabled
78
from django.forms.models import ModelForm
89
from isic_metadata.metadata import IGNORE_RCM_MODEL_CHECKS, MetadataBatch, MetadataRow
910
from pydantic import ValidationError as PydanticValidationError
@@ -50,11 +51,12 @@ def validate_csv_format_and_filenames(
5051
)
5152
)
5253

53-
matching_accessions = set(
54-
Accession.objects.filter(cohort=cohort, original_blob_name__in=filenames.keys())
55-
.values_list("original_blob_name", flat=True)
56-
.iterator()
57-
)
54+
with cachalot_disabled():
55+
matching_accessions = set(
56+
Accession.objects.filter(cohort=cohort, original_blob_name__in=filenames.keys())
57+
.values_list("original_blob_name", flat=True)
58+
.iterator()
59+
)
5860

5961
unknown_images = set(filenames.keys()) - matching_accessions
6062
if unknown_images:
@@ -188,7 +190,8 @@ def accession_values_to_metadata_dict(accession_values: dict[str, Any]) -> dict[
188190
else:
189191
yield row
190192

191-
for row in accessions.exclude(original_blob_name__in=yielded_filenames).iterator():
192-
yield accession_values_to_metadata_dict(row)
193+
with cachalot_disabled():
194+
for row in accessions.exclude(original_blob_name__in=yielded_filenames).iterator():
195+
yield accession_values_to_metadata_dict(row)
193196

194197
return _validate_df_consistency(cohort_df_merged_metadata_rows())

0 commit comments

Comments
 (0)