|
3 | 3 | import operator
|
4 | 4 | from typing import Any
|
5 | 5 |
|
| 6 | +from cachalot.api import cachalot_disabled |
6 | 7 | from django.db.models import Func
|
7 | 8 | from django.db.models.aggregates import Count
|
8 | 9 | from django.db.models.query import QuerySet
|
@@ -73,69 +74,73 @@ def staff_image_metadata_csv(
|
73 | 74 | + sorted([f"unstructured.{key}" for key in used_unstructured_metadata_keys])
|
74 | 75 | )
|
75 | 76 |
|
76 |
| - # Note this uses .values because populating django ORM objects is very slow, and doing this on |
77 |
| - # large querysets can add ~5s per 100k images to the request time. |
78 |
| - for image in ( |
79 |
| - qs.order_by("isic_id") |
80 |
| - .values( |
81 |
| - "accession__original_blob_name", |
82 |
| - "isic_id", |
83 |
| - "accession__cohort_id", |
84 |
| - "accession__cohort__name", |
85 |
| - "accession__cohort__attribution", |
86 |
| - "accession__copyright_license", |
87 |
| - "public", |
88 |
| - *[f"accession__{key}" for key in used_metadata_keys], |
89 |
| - *[f"accession__{field.csv_field_name}" for field in Accession.remapped_internal_fields], |
90 |
| - *[f"accession__{field.input_field_name}" for field in Accession.computed_fields], |
91 |
| - *[ |
92 |
| - f"accession__{field.relation_name}__{field.internal_id_name}" |
93 |
| - for field in Accession.remapped_internal_fields |
94 |
| - ], |
95 |
| - "accession__unstructured_metadata__value", |
96 |
| - ) |
97 |
| - .iterator() |
98 |
| - ): |
99 |
| - value = { |
100 |
| - "original_filename": image["accession__original_blob_name"], |
101 |
| - "isic_id": image["isic_id"], |
102 |
| - "cohort_id": image["accession__cohort_id"], |
103 |
| - "cohort": image["accession__cohort__name"], |
104 |
| - "attribution": image["accession__cohort__attribution"], |
105 |
| - "copyright_license": image["accession__copyright_license"], |
106 |
| - "public": image["public"], |
107 |
| - **{ |
108 |
| - k.replace("accession__", ""): v |
109 |
| - for k, v in image.items() |
110 |
| - if k.replace("accession__", "") in Accession.metadata_keys() |
111 |
| - }, |
112 |
| - **{ |
113 |
| - field.internal_id_name: image[ |
| 77 | + with cachalot_disabled(): |
| 78 | + # Note this uses .values because populating django ORM objects is very slow, and doing this |
| 79 | + # on large querysets can add ~5s per 100k images to the request time. |
| 80 | + for image in ( |
| 81 | + qs.order_by("isic_id") |
| 82 | + .values( |
| 83 | + "accession__original_blob_name", |
| 84 | + "isic_id", |
| 85 | + "accession__cohort_id", |
| 86 | + "accession__cohort__name", |
| 87 | + "accession__cohort__attribution", |
| 88 | + "accession__copyright_license", |
| 89 | + "public", |
| 90 | + *[f"accession__{key}" for key in used_metadata_keys], |
| 91 | + *[ |
| 92 | + f"accession__{field.csv_field_name}" |
| 93 | + for field in Accession.remapped_internal_fields |
| 94 | + ], |
| 95 | + *[f"accession__{field.input_field_name}" for field in Accession.computed_fields], |
| 96 | + *[ |
114 | 97 | f"accession__{field.relation_name}__{field.internal_id_name}"
|
115 |
| - ] |
116 |
| - for field in Accession.remapped_internal_fields |
117 |
| - }, |
118 |
| - **{ |
119 |
| - field.csv_field_name: image[f"accession__{field.csv_field_name}"] |
120 |
| - for field in Accession.remapped_internal_fields |
121 |
| - }, |
122 |
| - **{ |
123 |
| - f"unstructured.{k}": v |
124 |
| - for k, v in image["accession__unstructured_metadata__value"].items() |
125 |
| - }, |
126 |
| - } |
127 |
| - |
128 |
| - for field in Accession.computed_fields: |
129 |
| - computed_output_fields = field.transformer( |
130 |
| - image[f"accession__{field.input_field_name}"] |
131 |
| - if image.get(f"accession__{field.input_field_name}") |
132 |
| - else None |
| 98 | + for field in Accession.remapped_internal_fields |
| 99 | + ], |
| 100 | + "accession__unstructured_metadata__value", |
133 | 101 | )
|
134 |
| - |
135 |
| - if computed_output_fields: |
136 |
| - value.update(computed_output_fields) |
137 |
| - |
138 |
| - yield value |
| 102 | + .iterator() |
| 103 | + ): |
| 104 | + value = { |
| 105 | + "original_filename": image["accession__original_blob_name"], |
| 106 | + "isic_id": image["isic_id"], |
| 107 | + "cohort_id": image["accession__cohort_id"], |
| 108 | + "cohort": image["accession__cohort__name"], |
| 109 | + "attribution": image["accession__cohort__attribution"], |
| 110 | + "copyright_license": image["accession__copyright_license"], |
| 111 | + "public": image["public"], |
| 112 | + **{ |
| 113 | + k.replace("accession__", ""): v |
| 114 | + for k, v in image.items() |
| 115 | + if k.replace("accession__", "") in Accession.metadata_keys() |
| 116 | + }, |
| 117 | + **{ |
| 118 | + field.internal_id_name: image[ |
| 119 | + f"accession__{field.relation_name}__{field.internal_id_name}" |
| 120 | + ] |
| 121 | + for field in Accession.remapped_internal_fields |
| 122 | + }, |
| 123 | + **{ |
| 124 | + field.csv_field_name: image[f"accession__{field.csv_field_name}"] |
| 125 | + for field in Accession.remapped_internal_fields |
| 126 | + }, |
| 127 | + **{ |
| 128 | + f"unstructured.{k}": v |
| 129 | + for k, v in image["accession__unstructured_metadata__value"].items() |
| 130 | + }, |
| 131 | + } |
| 132 | + |
| 133 | + for field in Accession.computed_fields: |
| 134 | + computed_output_fields = field.transformer( |
| 135 | + image[f"accession__{field.input_field_name}"] |
| 136 | + if image.get(f"accession__{field.input_field_name}") |
| 137 | + else None |
| 138 | + ) |
| 139 | + |
| 140 | + if computed_output_fields: |
| 141 | + value.update(computed_output_fields) |
| 142 | + |
| 143 | + yield value |
139 | 144 |
|
140 | 145 |
|
141 | 146 | def image_metadata_csv(
|
@@ -167,28 +172,34 @@ def image_metadata_csv(
|
167 | 172 | fieldnames = headers + sorted(used_metadata_keys)
|
168 | 173 | yield fieldnames
|
169 | 174 |
|
170 |
| - # Note this uses .values because populating django ORM objects is very slow, and doing this on |
171 |
| - # large querysets can add ~5s per 100k images to the request time. |
172 |
| - for image in ( |
173 |
| - qs.order_by("isic_id") |
174 |
| - .values( |
175 |
| - "isic_id", |
176 |
| - "accession__cohort__attribution", |
177 |
| - "accession__copyright_license", |
178 |
| - *[f"accession__{key}" for key in Accession.metadata_keys()], |
179 |
| - *[f"accession__{field.csv_field_name}" for field in Accession.remapped_internal_fields], |
180 |
| - ) |
181 |
| - .iterator() |
182 |
| - ): |
183 |
| - image = {k.replace("accession__", ""): v for k, v in image.items()} # noqa: PLW2901 |
184 |
| - |
185 |
| - image["attribution"] = image.pop("cohort__attribution") |
186 |
| - |
187 |
| - for computed_field in Accession.computed_fields: |
188 |
| - if image[computed_field.input_field_name]: |
189 |
| - computed_fields = computed_field.transformer(image[computed_field.input_field_name]) |
190 |
| - if computed_fields: |
191 |
| - image.update(computed_fields) |
192 |
| - del image[computed_field.input_field_name] |
193 |
| - |
194 |
| - yield {k: v for k, v in image.items() if k in fieldnames} |
| 175 | + with cachalot_disabled(): |
| 176 | + # Note this uses .values because populating django ORM objects is very slow, and doing this |
| 177 | + # on large querysets can add ~5s per 100k images to the request time. |
| 178 | + for image in ( |
| 179 | + qs.order_by("isic_id") |
| 180 | + .values( |
| 181 | + "isic_id", |
| 182 | + "accession__cohort__attribution", |
| 183 | + "accession__copyright_license", |
| 184 | + *[f"accession__{key}" for key in Accession.metadata_keys()], |
| 185 | + *[ |
| 186 | + f"accession__{field.csv_field_name}" |
| 187 | + for field in Accession.remapped_internal_fields |
| 188 | + ], |
| 189 | + ) |
| 190 | + .iterator() |
| 191 | + ): |
| 192 | + image = {k.replace("accession__", ""): v for k, v in image.items()} # noqa: PLW2901 |
| 193 | + |
| 194 | + image["attribution"] = image.pop("cohort__attribution") |
| 195 | + |
| 196 | + for computed_field in Accession.computed_fields: |
| 197 | + if image[computed_field.input_field_name]: |
| 198 | + computed_fields = computed_field.transformer( |
| 199 | + image[computed_field.input_field_name] |
| 200 | + ) |
| 201 | + if computed_fields: |
| 202 | + image.update(computed_fields) |
| 203 | + del image[computed_field.input_field_name] |
| 204 | + |
| 205 | + yield {k: v for k, v in image.items() if k in fieldnames} |
0 commit comments